[PATCH] powerpc/process: fix altivec SPR not being saved

2016-03-06 Thread Oliver O'Halloran
In save_sprs() in process.c contains the following test:

if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC)))
t->vrsave = mfspr(SPRN_VRSAVE);

CPU feature with the mask 0x1 is CPU_FTR_COHERENT_ICACHE so the test
is equivilent to:

if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
cpu_has_feature(CPU_FTR_COHERENT_ICACHE))

On CPUs without support for both (i.e G5) this results in vrsave not being
saved between context switches. The vector register save/restore code
doesn't use VRSAVE to determine which registers to save/restore,
but the value of VRSAVE is used to determine if altivec is being used
in several code paths.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 8224852..5a4d4d1 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -855,7 +855,7 @@ void restore_tm_state(struct pt_regs *regs)
 static inline void save_sprs(struct thread_struct *t)
 {
 #ifdef CONFIG_ALTIVEC
-   if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC)))
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
t->vrsave = mfspr(SPRN_VRSAVE);
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/process: fix altivec SPR not being saved

2016-03-07 Thread Oliver O'Halloran
In save_sprs() in process.c contains the following test:

if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC)))
t->vrsave = mfspr(SPRN_VRSAVE);

CPU feature with the mask 0x1 is CPU_FTR_COHERENT_ICACHE so the test
is equivilent to:

if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
cpu_has_feature(CPU_FTR_COHERENT_ICACHE))

On CPUs without support for both (i.e G5) this results in vrsave not being
saved between context switches. The vector register save/restore code
doesn't use VRSAVE to determine which registers to save/restore,
but the value of VRSAVE is used to determine if altivec is being used
in several code paths.

Signed-off-by: Oliver O'Halloran 
Signed-off-by: Anton Blanchard 
Fixes: 152d523e6307 ("powerpc: Create context switch helpers save_sprs() and 
restore_sprs()")
Cc: sta...@vger.kernel.org
---
 arch/powerpc/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index dccc87e8fee5..bc6aa87a3b12 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -854,7 +854,7 @@ void restore_tm_state(struct pt_regs *regs)
 static inline void save_sprs(struct thread_struct *t)
 {
 #ifdef CONFIG_ALTIVEC
-   if (cpu_has_feature(cpu_has_feature(CPU_FTR_ALTIVEC)))
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
t->vrsave = mfspr(SPRN_VRSAVE);
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 1/2] powerpc/timer - large decrementer support

2016-04-11 Thread Oliver O'Halloran
POWER ISA v3 adds large decrementer (LD) mode of operation which increases
the size of the decrementer register from 32 bits to an implementation
defined with of up to 64 bits.

This patch adds support for the LD on processors with the CPU_FTR_ARCH_300
cpu feature flag set. Even for CPUs with this feature LD mode is only
enabled when the property ibm,dec-bits devicetree property is supplied
for the boot CPU. The decrementer value is a signed quantity (with
negative values indicating a pending exception) and this property is
required to find the maximum positive decrementer value. If this property
is not supplied then the traditional decrementer width of 32 bits is
assumed and LD mode is disabled.

This patch was based on inital work by Jack Miller.

Signed-off-by: Oliver O'Halloran 
Cc: Jack Miller 
---
 arch/powerpc/include/asm/reg.h  |  1 +
 arch/powerpc/include/asm/time.h |  6 +--
 arch/powerpc/kernel/time.c  | 89 +
 3 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f5f4c66bbbc9..ff581ed1ab9d 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x0180  /* MMU on exception offset 0xc00...4xxx 
*/
 #define   LPCR_ONL 0x0004  /* online - PURR/SPURR count */
+#define   LPCR_LD  0x0002  /* large decremeter */
 #define   LPCR_PECE0x0001f000  /* powersave exit cause enable */
 #define LPCR_PECEDP0x0001  /* directed priv dbells cause 
exit */
 #define LPCR_PECEDH0x8000  /* directed hyp dbells cause 
exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int 
lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 81b0900a39ee..0afaef6b5b6a 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -95,7 +95,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX0x7fff
+#define DECREMENTER_DEFAULT_MAX 0x7FFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -503,7 +504,7 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
+   if (now <= decrementer_max)
set_dec((int)now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
@@ -534,7 +535,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
@@ -562,6 +563,7 @@ void timer_interrupt(struct pt_regs * regs)
irq_enter();
 
__timer_interrupt();
+
irq_exit();
set_irq_regs(old_regs);
 }
@@ -582,9 +584,9 @@ static void generic_suspend_disable_irqs(void)
 * with suspending.
 */
 
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
local_irq_disable();
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 }
 
 static void generic_suspend_enable_irqs(void)
@@ -865,7 +867,7 @@ static int decrementer_set_next_event(unsigned long evt,
 
 static int decrementer_shutdown(struct clock_event_device *dev)
 {
-   decrementer_set_next_event(DECREMENTER_MAX, dev);
+   decrementer_set_next_event(decrementer_max, dev);
return 0;
 }
 
@@ -891,6 +893,72 @@ 

[PATCH 2/2] KVM: PPC: hypervisor large decrementer support

2016-04-11 Thread Oliver O'Halloran
Power ISAv3 extends the width of the decrementer register from 32 bits.
The enlarged register width is implementation dependent, but reads from
these registers are automatically sign extended to produce a 64 bit output
when operating in large mode. The HDEC always operates in large mode
while the DEC register can be operated in 32bit mode or large mode
depending on the setting of the LPCR.LD bit.

Currently the hypervisor assumes that reads from the DEC and HDEC register
produce a 32 bit result which it sign extends to 64 bits using the extsw
instruction. This behaviour can result in the guest DEC register value
being corrupted by the hypervisor when the guest is operating in LD mode since
the results of the extsw instruction only depends on the value of bit
31 in the register to be sign extended.

This patch adds the GET_DEC() and GET_HDEC() assembly macros for reading
from the decrementer registers. These macros will return the current
decrementer value as a 64 bit quantity regardless of the Host CPU or
guest decrementer operating mode. Additionally this patch corrects several
uses of decrementer values that assume a 32 bit register width.

Signed-off-by: Oliver O'Halloran 
Cc: Paul Mackerras 
---
 arch/powerpc/include/asm/exception-64s.h | 22 ++
 arch/powerpc/include/asm/kvm_host.h  |  2 +-
 arch/powerpc/include/asm/kvm_ppc.h   |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h  |  2 +-
 arch/powerpc/kernel/exceptions-64s.S |  9 +++-
 arch/powerpc/kvm/book3s_hv_interrupts.S  |  3 +--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 38 ++--
 arch/powerpc/kvm/emulate.c   |  4 ++--
 8 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h 
b/arch/powerpc/include/asm/exception-64s.h
index 93ae809fe5ea..d922f76c682d 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -545,4 +545,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 #define FINISH_NAP
 #endif
 
+/* these ensure that we always get a 64bit value from the
+ * decrementer register. */
+
+#define IS_LD_ENABLED(reg) \
+   mfspr  reg,SPRN_LPCR;  \
+   andis. reg,reg,(LPCR_LD >> 16);
+
+#define GET_DEC(reg)   \
+   IS_LD_ENABLED(reg);\
+   mfspr reg, SPRN_DEC;   \
+   bne 99f;   \
+   extsw reg, reg;\
+99:
+
+/* For CPUs that support it the Hypervisor LD is
+ * always enabled, so this needs to be feature gated */
+#define GET_HDEC(reg) \
+   mfspr reg, SPRN_HDEC;   \
+BEGIN_FTR_SECTION   \
+   extsw reg, reg; \
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
 #endif /* _ASM_POWERPC_EXCEPTION_H */
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index d7b343170453..6330d3fca083 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -516,7 +516,7 @@ struct kvm_vcpu_arch {
ulong mcsrr0;
ulong mcsrr1;
ulong mcsr;
-   u32 dec;
+   u64 dec;
 #ifdef CONFIG_BOOKE
u32 decar;
 #endif
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 2544edabe7f3..4de0102930e9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -94,7 +94,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
 extern int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
-extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
+extern u64 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(struct kvm_vcpu *vcpu);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index c93cf35ce379..2dd92e841127 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -215,7 +215,7 @@ struct kvm_sregs {
__u32 tsr;  /* KVM_SREGS_E_UPDATE_TSR */
__u32 tcr;
__u32 decar;
-   __u32 dec;  /* KVM_SREGS_E_UPDATE_DEC */
+   __u64 dec;  /* KVM_SREGS_E_UPDATE_DEC */
 
/*
 * Userspace can read TB directly, but the
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 7716cebf4b8e..984ae894e758 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -641,7 +641,14 @@ masked_##_H##interrupt:
\
stb r11,PACAIRQH

[PATCH] Fix fall-through from case 30 (rld*) to case 31

2016-01-24 Thread Oliver O'Halloran
I think this bug can only be triggered if the instruction to
simulate is malformed. The switch in the else case only handles
the zero and one case, but it extracts bits 4:1 from the
instruction word so it may be other values. It's pretty minor, but
a bug is a bug.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/lib/sstep.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index dc885b3..e25f73c 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, 
struct pt_regs *regs,
}
}
 #endif
+   break; /* illegal instruction */
 
case 31:
switch ((instr >> 1) & 0x3ff) {
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/lib/sstep.c - Fix emulation fall-through

2016-02-15 Thread Oliver O'Halloran
There is a switch fallthough in instr_analyze() which can cause
an invalid instruction to be emulated as a different, valid,
instruction. The rld* (opcode 30) case extracts a sub-opcode from
bits 3:1 of the instruction word. However, the only valid values
of this field a 001 and 000. These cases are correctly handled,
but the others are not which causes execution to fall through
into case 31.

Breaking out of the switch causes the instruction to be marked as
unknown and allows the caller to deal with the invalid instruction
in a manner consistent with other invalid instructions.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/lib/sstep.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index dc885b3..e25f73c 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, 
struct pt_regs *regs,
}
}
 #endif
+   break; /* illegal instruction */
 
case 31:
switch ((instr >> 1) & 0x3ff) {
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2] powerpc/sstep.c - Fix emulation fall-through

2016-02-15 Thread Oliver O'Halloran
There is a switch fallthough in instr_analyze() which can cause
an invalid instruction to be emulated as a different, valid,
instruction. The rld* (opcode 30) case extracts a sub-opcode from
bits 3:1 of the instruction word. However, the only valid values
of this field a 001 and 000. These cases are correctly handled,
but the others are not which causes execution to fall through
into case 31.

Breaking out of the switch causes the instruction to be marked as
unknown and allows the caller to deal with the invalid instruction
in a manner consistent with other invalid instructions.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/lib/sstep.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index dc885b3..e25f73c 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, 
struct pt_regs *regs,
}
}
 #endif
+   break; /* illegal instruction */
 
case 31:
switch ((instr >> 1) & 0x3ff) {
-- 
2.5.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] powerpc/powernv: Remove ifdef

2019-04-14 Thread Oliver O'Halloran
The PowerNV platform depends on CONFIG_PPC64 so this #ifdef in the middle
of platforms/powernv/pci.c really doesn't need to be there.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index d235c2c..a5b2c7b 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -1039,7 +1039,6 @@ int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, 
int enable)
 }
 EXPORT_SYMBOL_GPL(pnv_pci_set_tunnel_bar);
 
-#ifdef CONFIG_PPC64/* for thread.tidr */
 int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid, u32 *pid,
   u32 *tid)
 {
@@ -1060,7 +1059,6 @@ int pnv_pci_get_as_notify_info(struct task_struct *task, 
u32 *lpid, u32 *pid,
return 0;
 }
 EXPORT_SYMBOL_GPL(pnv_pci_get_as_notify_info);
-#endif
 
 void pnv_pci_shutdown(void)
 {
-- 
2.9.4



Re: [PATCH 3/8] powerpc/eeh: Convert PNV_PHB_FLAG_EEH to global flag

2019-04-18 Thread Oliver O'Halloran
On Wed, 2019-03-20 at 13:58 +1100, Sam Bobroff wrote:
> The PHB flag, PNV_PHB_FLAG_EEH, is set (on PowerNV) individually on
> each PHB once the EEH subsystem is ready. It is the only use of the
> flags member of the phb struct.
> 
> However there is no need to store this separately on each PHB, so
> convert it to a global flag. For symmetry, the flag is now also set
> for pSeries; although it is currently unused it may be useful in the
> future.

I'd drop this patch and keep it as a seperate flag. Making this a
global flag doesn't really buy us anything either. It's also worth
remembering that we do have virtual PHBs, like the NVLink ones, that
don't have real EEH support and we should be doing something more
intelligent about that.

If you are going to remove the PNV_PHB flag then i would look at moving
that flag into the generic pci_controller structure that we use for
tracking PHBs since that would give us a way to toggle EEH support on a
per PHB basis on pseries and powernv.

> Signed-off-by: Sam Bobroff 
> ---
>  arch/powerpc/include/asm/eeh.h   | 11 +++
>  arch/powerpc/platforms/powernv/eeh-powernv.c | 14 +++---
>  arch/powerpc/platforms/powernv/pci.c |  7 +++
>  arch/powerpc/platforms/powernv/pci.h |  2 --
>  arch/powerpc/platforms/pseries/pci.c |  4 
>  5 files changed, 21 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 3613a56281f2..fe4cf7208890 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -43,6 +43,7 @@ struct pci_dn;
>  #define EEH_VALID_PE_ZERO0x10/* PE#0 is valid */
>  #define EEH_ENABLE_IO_FOR_LOG0x20/* Enable IO for log
>  */
>  #define EEH_EARLY_DUMP_LOG   0x40/* Dump log immediately  */
> +#define EEH_PHB_ENABLED  0x80/* PHB recovery uses EEH
>  */
>  
>  /*
>   * Delay for PE reset, all in ms
> @@ -245,6 +246,11 @@ static inline bool eeh_enabled(void)
>   return eeh_has_flag(EEH_ENABLED) && !eeh_has_flag(EEH_FORCE_DISABLED);
>  }
>  
> +static inline bool eeh_phb_enabled(void)
> +{
> + return eeh_has_flag(EEH_PHB_ENABLED);
> +}
> +
>  static inline void eeh_serialize_lock(unsigned long *flags)
>  {
>   raw_spin_lock_irqsave(&confirm_error_lock, *flags);
> @@ -332,6 +338,11 @@ static inline bool eeh_enabled(void)
>  return false;
>  }
>  
> +static inline bool eeh_phb_enabled(void)
> +{
> + return false;
> +}
> +
>  static inline void eeh_probe_devices(void) { }
>  
>  static inline void *eeh_dev_init(struct pci_dn *pdn, void *data)
> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
> b/arch/powerpc/platforms/powernv/eeh-powernv.c
> index 6fc1a463b796..f0a95f663810 100644
> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
> @@ -264,22 +264,14 @@ int pnv_eeh_post_init(void)
>   return ret;
>   }
>  
> - if (!eeh_enabled())
> + if (eeh_enabled())
> + eeh_add_flag(EEH_PHB_ENABLED);
> + else
>   disable_irq(eeh_event_irq);
>  
>   list_for_each_entry(hose, &hose_list, list_node) {
>   phb = hose->private_data;
>  
> - /*
> -  * If EEH is enabled, we're going to rely on that.
> -  * Otherwise, we restore to conventional mechanism
> -  * to clear frozen PE during PCI config access.
> -  */
> - if (eeh_enabled())
> - phb->flags |= PNV_PHB_FLAG_EEH;
> - else
> - phb->flags &= ~PNV_PHB_FLAG_EEH;
> -
>   /* Create debugfs entries */
>  #ifdef CONFIG_DEBUG_FS
>   if (phb->has_dbgfs || !phb->dbgfs)
> diff --git a/arch/powerpc/platforms/powernv/pci.c 
> b/arch/powerpc/platforms/powernv/pci.c
> index 307181fd8a17..d2b50f3bf6b1 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -717,10 +717,9 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
>  static bool pnv_pci_cfg_check(struct pci_dn *pdn)
>  {
>   struct eeh_dev *edev = NULL;
> - struct pnv_phb *phb = pdn->phb->private_data;
>  
>   /* EEH not enabled ? */
> - if (!(phb->flags & PNV_PHB_FLAG_EEH))
> + if (!eeh_phb_enabled())
>   return true;
>  
>   /* PE reset or device removed ? */
> @@ -761,7 +760,7 @@ static int pnv_pci_read_config(struct pci_bus *bus,
>  
>   ret = pnv_pci_cfg_read(pdn, where, size, val);
>   phb = pdn->phb->private_data;
> - if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) {
> + if (eeh_phb_enabled() && pdn->edev) {
>   if (*val == EEH_IO_ERROR_VALUE(size) &&
>   eeh_dev_check_failure(pdn->edev))
>  return PCIBIOS_DEVICE_NOT_FOUND;
> @@ -789,7 +788,7 @@ static int pnv_pci_write_config(struct pci_bus *bus,
>  

Re: [PATCH 5/8] powerpc/eeh: Add eeh_show_enabled()

2019-04-18 Thread Oliver O'Halloran
On Wed, 2019-03-20 at 13:58 +1100, Sam Bobroff wrote:
> Move the EEH enabled message into it's own function so that future
> work can call it from multiple places.
> 
> Signed-off-by: Sam Bobroff 
> ---
>  arch/powerpc/include/asm/eeh.h |  3 +++
>  arch/powerpc/kernel/eeh.c  | 16 +++-
>  2 files changed, 14 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index fe4cf7208890..e217ccda55d0 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -289,6 +289,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
>  
>  struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
>  void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
> +void eeh_show_enabled(void);
>  void eeh_probe_devices(void);
>  int __init eeh_ops_register(struct eeh_ops *ops);
>  int __exit eeh_ops_unregister(const char *name);
> @@ -338,6 +339,8 @@ static inline bool eeh_enabled(void)
>  return false;
>  }
>  
> +static inline void eeh_show_enabled(void) { }
> +
>  static inline bool eeh_phb_enabled(void)
>  {
>   return false;
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index b14d89547895..3dcff29cb9b3 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -163,6 +163,16 @@ static int __init eeh_setup(char *str)
>  }
>  __setup("eeh=", eeh_setup);
>  
> +void eeh_show_enabled(void)
> +{
> + if (eeh_has_flag(EEH_FORCE_DISABLED))
> + pr_info("EEH: PCI Enhanced I/O Error Handling DISABLED (by 
> eeh=off)\n");

The allcaps looks kind of stupid.

> + else if (eeh_enabled())
> + pr_info("EEH: PCI Enhanced I/O Error Handling ENABLED (capable 
> adapter found)\n");
> + else
> + pr_info("EEH: PCI Enhanced I/O Error Handling DISABLED (no 
> capable adapter found)\n");

If we really have to keep these messages then make it say "no EEH
capable buses found" or something. EEH support has absolutely nothing
to do with the adapters and I'm not even sure we can get the "DISABLED"
message on PowerNV since the root port will always be there.

> +}
> +
>  /*
>   * This routine captures assorted PCI configuration space data
>   * for the indicated PCI device, and puts them into a buffer
> @@ -1166,11 +1176,7 @@ void eeh_probe_devices(void)
>   pdn = hose->pci_data;
>   traverse_pci_dn(pdn, eeh_ops->probe, NULL);
>   }
> - if (eeh_enabled())
> - pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
> - else
> - pr_info("EEH: No capable adapters found\n");
> -
> + eeh_show_enabled();
>  }
>  
>  /**



Re: [PATCH 6/8] powerpc/eeh: Initialize EEH address cache earlier

2019-04-18 Thread Oliver O'Halloran
On Wed, 2019-03-20 at 13:58 +1100, Sam Bobroff wrote:
> The EEH address cache is currently initialized and populated by a
> single function: eeh_addr_cache_build().  While the initial population
> of the cache can only be done once resources are allocated,
> initialization (just setting up a spinlock) could be done much
> earlier.
> 
> So move the initialization step into a separate function and call it
> from a core_initcall (rather than a subsys initcall).
> 

> This will allow future work to make use of the cache during boot time
> PCI scanning.

What's the idea there? Checking for overlaps in the BAR assignments?

> Signed-off-by: Sam Bobroff 
> ---
>  arch/powerpc/include/asm/eeh.h  |  3 +++
>  arch/powerpc/kernel/eeh.c   |  2 ++
>  arch/powerpc/kernel/eeh_cache.c | 13 +++--
>  3 files changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index e217ccda55d0..791b9e6fcc45 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -295,6 +295,7 @@ int __init eeh_ops_register(struct eeh_ops *ops);
>  int __exit eeh_ops_unregister(const char *name);
>  int eeh_check_failure(const volatile void __iomem *token);
>  int eeh_dev_check_failure(struct eeh_dev *edev);
> +void eeh_addr_cache_init(void);
>  void eeh_addr_cache_build(void);
>  void eeh_add_device_early(struct pci_dn *);
>  void eeh_add_device_tree_early(struct pci_dn *);
> @@ -362,6 +363,8 @@ static inline int eeh_check_failure(const volatile void 
> __iomem *token)
>  
>  #define eeh_dev_check_failure(x) (0)
>  
> +static inline void eeh_addr_cache_init(void) { }
> +
>  static inline void eeh_addr_cache_build(void) { }
>  
>  static inline void eeh_add_device_early(struct pci_dn *pdn) { }
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 3dcff29cb9b3..7a406d58d2c0 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -1219,6 +1219,8 @@ static int eeh_init(void)
>   list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
>   eeh_dev_phb_init_dynamic(hose);
>  
> + eeh_addr_cache_init();
> +
>   /* Initialize EEH event */
>   return eeh_event_init();
>  }
> diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
> index 9c68f0837385..f93dd5cf6a39 100644
> --- a/arch/powerpc/kernel/eeh_cache.c
> +++ b/arch/powerpc/kernel/eeh_cache.c
> @@ -267,6 +267,17 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
>   spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
>  }
>  
> +/**
> + * eeh_addr_cache_init - Initialize a cache of I/O addresses
> + *
> + * Initialize a cache of pci i/o addresses.  This cache will be used to
> + * find the pci device that corresponds to a given address.
> + */
> +void eeh_addr_cache_init(void)
> +{
> + spin_lock_init(&pci_io_addr_cache_root.piar_lock);
> +}

You could move this out of the pci_io_addr_cache structure and use
DEFINE_SPINLOCK() too. We might even be able to get rid of the hand-
rolled interval tree in eeh_cache.c in favour of the generic
implementation (see mm/interval_tree.c). I'm pretty sure the generic
one is a drop-in replacement, but I haven't had a chance to have a
detailed look to see if there's any differences in behaviour.

> +
>  /**
>   * eeh_addr_cache_build - Build a cache of I/O addresses
>   *
> @@ -282,8 +293,6 @@ void eeh_addr_cache_build(void)
>   struct eeh_dev *edev;
>   struct pci_dev *dev = NULL;
>  
> - spin_lock_init(&pci_io_addr_cache_root.piar_lock);
> -
>   for_each_pci_dev(dev) {
>   pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
>   if (!pdn)



Re: [PATCH] Add OPAL_GET_SYMBOL / OPAL_LOOKUP_SYMBOL

2020-03-05 Thread Oliver O'Halloran
On Fri, Feb 28, 2020 at 2:09 PM Nicholas Piggin  wrote:
>
> These calls can be used by Linux to annotate BUG addresses with symbols,
> look up symbol addresses in xmon, etc.
>
> This is preferable over having Linux parse the OPAL symbol map itself,
> because OPAL's parsing code already exists for its own symbol printing,
> and it can support other code regions than the skiboot symbols, e.g.,
> the wake-up code in the HOMER (where CPUs have been seen to get stuck).
>
> Signed-off-by: Nicholas Piggin 
> ---
>  core/opal.c |  2 +
>  core/utils.c| 92 +++--
>  doc/opal-api/opal-get-symbol-181.rst| 42 +++
>  doc/opal-api/opal-lookup-symbol-182.rst | 35 ++
>  include/opal-api.h  |  4 +-
>  5 files changed, 168 insertions(+), 7 deletions(-)
>  create mode 100644 doc/opal-api/opal-get-symbol-181.rst
>  create mode 100644 doc/opal-api/opal-lookup-symbol-182.rst
>
> diff --git a/core/opal.c b/core/opal.c
> index d6ff6027b..d9fc4fe05 100644
> --- a/core/opal.c
> +++ b/core/opal.c
> @@ -142,6 +142,8 @@ int64_t opal_entry_check(struct stack_frame *eframe)
> case OPAL_CEC_REBOOT:
> case OPAL_CEC_REBOOT2:
> case OPAL_SIGNAL_SYSTEM_RESET:
> +   case OPAL_GET_SYMBOL:
> +   case OPAL_LOOKUP_SYMBOL:

These names are still awful :|

> break;
> default:
> printf("CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=%04lx 
> cpu @%p -> pir=%04x token=%llu\n",
> diff --git a/core/utils.c b/core/utils.c
> index 8fd63fcb7..5f0d5130b 100644
> --- a/core/utils.c
> +++ b/core/utils.c
> @@ -48,40 +48,120 @@ char __attrconst tohex(uint8_t nibble)
> return __tohex[nibble];
>  }
>
> -static unsigned long get_symbol(unsigned long addr, char **sym, char 
> **sym_end)
> +static unsigned long get_symbol(unsigned long addr, char **sym, char 
> **sym_end, unsigned long *size)
>  {
> unsigned long prev = 0, next;
> char *psym = NULL, *p = __sym_map_start;
>
> *sym = *sym_end = NULL;
> -   while(p < __sym_map_end) {
> +   while (p < __sym_map_end) {
> next = strtoul(p, &p, 16) | SKIBOOT_BASE;
> if (next > addr && prev <= addr) {
> -   p = psym + 3;;
> +   p = psym + 3;
> if (p >= __sym_map_end)
> return 0;
> *sym = p;
> -   while(p < __sym_map_end && *p != 10)
> +   while (p < __sym_map_end && *p != '\n')
> p++;
> *sym_end = p;
> +   *size = next - prev;
> return prev;
> }
> prev = next;
> psym = p;
> -   while(p < __sym_map_end && *p != 10)
> +   while (p < __sym_map_end && *p != '\n')
> p++;
> p++;
> }
> return 0;
>  }
>
> +static unsigned long lookup_symbol(const char *name, unsigned long *size)
> +{
> +   size_t len = strlen(name);
> +   unsigned long addr = 0;
> +   char *sym;
> +   char *p = __sym_map_start;
> +
> +   while (p < __sym_map_end) {
> +   addr = strtoul(p, &p, 16) | SKIBOOT_BASE;
> +   p += 3;
> +   if (p >= __sym_map_end)
> +   return 0;
> +
> +   if (*(p + len) == '\n' && !strncmp(name, p, len)) {
> +   char *sym_end;
> +
> +   if (get_symbol(addr, &sym, &sym_end, size) == 0) {
> +   assert(!strcmp(name, "_end"));
> +   *size = 0;
> +   }
> +
> +   /*
> +* May be more than one symbol at this address but
> +* symbol length calculation should still work in
> +* that case.
> +*/
> +
> +   return addr;
> +   }
> +
> +   while(p < __sym_map_end && *p != '\n')
> +   p++;
> +   p++;
> +   }
> +   return 0;
> +}
> +
> +static int64_t opal_get_symbol(uint64_t addr, __be64 *symaddr, __be64 
> *symsize, char *namebuf, uint64_t buflen)
> +{
> +   unsigned long saddr;
> +   unsigned long ssize;
> +   char *sym, *sym_end;
> +   size_t l;
> +
> +   saddr = get_symbol(addr, &sym, &sym_end, &ssize);
> +   if (!saddr)
> +   return OPAL_RESOURCE;
> +
> +   if (buflen > sym_end - sym)
> +   l = sym_end - sym;
> +   else
> +   l = buflen - 1;
> +   memcpy(namebuf, sym, l);
> +   namebuf[l] = '\0';
> +
> +   *symaddr = cpu_to_be64(saddr);
> +   *symsize = cpu_to_be64(ssize);
> +
> +   

[PATCH v2 1/6] powerpc/eeh: Add sysfs files in late probe

2020-03-05 Thread Oliver O'Halloran
Move creating the EEH specific sysfs files into eeh_add_device_late()
rather than being open-coded all over the place. Calling the function is
generally done immediately after calling eeh_add_device_late() anyway. This
is also a correctness fix since currently the sysfs files will be added
even if the EEH probe happens to fail.

Similarly, on pseries we currently add the sysfs files before calling
eeh_add_device_late(). This is flat-out broken since the sysfs files
require the pci_dev->dev.archdata.edev pointer to be set, and that is done
in eeh_add_device_late().

Reviewed-by: Sam Bobroff 
Signed-off-by: Oliver O'Halloran 
---
v2: Reworded commit message based on Sam Bobroff's comments. About the
current behaviour being broken.
---
 arch/powerpc/include/asm/eeh.h   |  3 ---
 arch/powerpc/kernel/eeh.c| 24 +---
 arch/powerpc/kernel/of_platform.c|  3 ---
 arch/powerpc/kernel/pci-common.c |  3 ---
 arch/powerpc/platforms/powernv/eeh-powernv.c |  1 -
 arch/powerpc/platforms/pseries/eeh_pseries.c |  3 +--
 6 files changed, 2 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 6f9b2a1..5a34907 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -305,7 +305,6 @@ void eeh_add_device_early(struct pci_dn *);
 void eeh_add_device_tree_early(struct pci_dn *);
 void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
-void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
 int eeh_unfreeze_pe(struct eeh_pe *pe);
 int eeh_pe_reset_and_recover(struct eeh_pe *pe);
@@ -368,8 +367,6 @@ static inline void eeh_add_device_late(struct pci_dev *dev) 
{ }
 
 static inline void eeh_add_device_tree_late(struct pci_bus *bus) { }
 
-static inline void eeh_add_sysfs_files(struct pci_bus *bus) { }
-
 static inline void eeh_remove_device(struct pci_dev *dev) { }
 
 #define EEH_POSSIBLE_ERROR(val, type) (0)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 17cb3e9..0878912 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1210,6 +1210,7 @@ void eeh_add_device_late(struct pci_dev *dev)
dev->dev.archdata.edev = edev;
 
eeh_addr_cache_insert_dev(dev);
+   eeh_sysfs_add_device(dev);
 }
 
 /**
@@ -1238,29 +1239,6 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
 EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
 
 /**
- * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus
- * @bus: PCI bus
- *
- * This routine must be used to add EEH sysfs files for PCI
- * devices which are attached to the indicated PCI bus. The PCI bus
- * is added after system boot through hotplug or dlpar.
- */
-void eeh_add_sysfs_files(struct pci_bus *bus)
-{
-   struct pci_dev *dev;
-
-   list_for_each_entry(dev, &bus->devices, bus_list) {
-   eeh_sysfs_add_device(dev);
-   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-   struct pci_bus *subbus = dev->subordinate;
-   if (subbus)
-   eeh_add_sysfs_files(subbus);
-   }
-   }
-}
-EXPORT_SYMBOL_GPL(eeh_add_sysfs_files);
-
-/**
  * eeh_remove_device - Undo EEH setup for the indicated pci device
  * @dev: pci device to be removed
  *
diff --git a/arch/powerpc/kernel/of_platform.c 
b/arch/powerpc/kernel/of_platform.c
index 427fc22..cb68800 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -86,9 +86,6 @@ static int of_pci_phb_probe(struct platform_device *dev)
/* Add probed PCI devices to the device model */
pci_bus_add_devices(phb->bus);
 
-   /* sysfs files should only be added after devices are added */
-   eeh_add_sysfs_files(phb->bus);
-
return 0;
 }
 
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index c6c0341..3d2b1cf 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1404,9 +1404,6 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus)
 
/* Add new devices to global lists.  Register in proc, sysfs. */
pci_bus_add_devices(bus);
-
-   /* sysfs files should only be added after devices are added */
-   eeh_add_sysfs_files(bus);
 }
 EXPORT_SYMBOL_GPL(pcibios_finish_adding_to_bus);
 
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6f300ab..ef727ec 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -48,7 +48,6 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
dev_dbg(&pdev->dev, "EEH: Setting up device\n");
eeh_add_device_early(pdn);
eeh_add_device_late(pdev);
-   eeh_sysfs_add_device(pdev);
 }
 
 sta

[PATCH v2 2/6] powerpc/eeh: Remove eeh_add_device_tree_late()

2020-03-05 Thread Oliver O'Halloran
On pseries and PowerNV pcibios_bus_add_device() calls eeh_add_device_late()
so there's no need to do a separate tree traversal to bind the eeh_dev and
pci_dev together setting up the PHB at boot. As a result we can remove
eeh_add_device_tree_late().

Reviewed-by: Sam Bobroff 
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/eeh.h|  3 ---
 arch/powerpc/kernel/eeh.c | 25 -
 arch/powerpc/kernel/of_platform.c |  3 ---
 arch/powerpc/kernel/pci-common.c  |  3 ---
 4 files changed, 34 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 5a34907..5d10781 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -304,7 +304,6 @@ void eeh_addr_cache_init(void);
 void eeh_add_device_early(struct pci_dn *);
 void eeh_add_device_tree_early(struct pci_dn *);
 void eeh_add_device_late(struct pci_dev *);
-void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
 int eeh_unfreeze_pe(struct eeh_pe *pe);
 int eeh_pe_reset_and_recover(struct eeh_pe *pe);
@@ -365,8 +364,6 @@ static inline void eeh_add_device_tree_early(struct pci_dn 
*pdn) { }
 
 static inline void eeh_add_device_late(struct pci_dev *dev) { }
 
-static inline void eeh_add_device_tree_late(struct pci_bus *bus) { }
-
 static inline void eeh_remove_device(struct pci_dev *dev) { }
 
 #define EEH_POSSIBLE_ERROR(val, type) (0)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 0878912..9cb3370 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1214,31 +1214,6 @@ void eeh_add_device_late(struct pci_dev *dev)
 }
 
 /**
- * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI 
bus
- * @bus: PCI bus
- *
- * This routine must be used to perform EEH initialization for PCI
- * devices which are attached to the indicated PCI bus. The PCI bus
- * is added after system boot through hotplug or dlpar.
- */
-void eeh_add_device_tree_late(struct pci_bus *bus)
-{
-   struct pci_dev *dev;
-
-   if (eeh_has_flag(EEH_FORCE_DISABLED))
-   return;
-   list_for_each_entry(dev, &bus->devices, bus_list) {
-   eeh_add_device_late(dev);
-   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-   struct pci_bus *subbus = dev->subordinate;
-   if (subbus)
-   eeh_add_device_tree_late(subbus);
-   }
-   }
-}
-EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
-
-/**
  * eeh_remove_device - Undo EEH setup for the indicated pci device
  * @dev: pci device to be removed
  *
diff --git a/arch/powerpc/kernel/of_platform.c 
b/arch/powerpc/kernel/of_platform.c
index cb68800..64edac81 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -80,9 +80,6 @@ static int of_pci_phb_probe(struct platform_device *dev)
 */
pcibios_claim_one_bus(phb->bus);
 
-   /* Finish EEH setup */
-   eeh_add_device_tree_late(phb->bus);
-
/* Add probed PCI devices to the device model */
pci_bus_add_devices(phb->bus);
 
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 3d2b1cf..8983afa 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1399,9 +1399,6 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus)
pci_assign_unassigned_bus_resources(bus);
}
 
-   /* Fixup EEH */
-   eeh_add_device_tree_late(bus);
-
/* Add new devices to global lists.  Register in proc, sysfs. */
pci_bus_add_devices(bus);
 }
-- 
2.9.5



[PATCH v2 3/6] powerpc/eeh: Do early EEH init only when required

2020-03-05 Thread Oliver O'Halloran
The pci hotplug helper (pci_hp_add_devices()) calls
eeh_add_device_tree_early() to scan the device-tree for new PCI devices and
do the early EEH probe before the device is scanned. This early probe is a
no-op in a lot of cases because:

a) The early init is only required to satisfy a PAPR requirement that EEH
   be configured before we start doing config accesses. On PowerNV it is
   a no-op.

b) It's a no-op for devices that have already had their eeh_dev
   initialised.

There are four callers of pci_hp_add_devices():

1. arch/powerpc/kernel/eeh_driver.c
Here the hotplug helper is called when re-scanning pci_devs that
were removed during an EEH recovery pass. The EEH stat for each
removed device (the eeh_dev) is retained across a recovery pass
so the early init is a no-op in this case.

2. drivers/pci/hotplug/pnv_php.c
This is also a no-op since the PowerNV hotplug driver is, suprisingly,
PowerNV specific.

3. drivers/pci/hotplug/rpaphp_core.c
4. drivers/pci/hotplug/rpaphp_pci.c
In these two cases new devices have been hotplugged and FW has
provided new DT nodes for each. These are the only two cases where
the EEH we might have new PCI device nodes in the DT so these are
the only two cases where the early EEH probe needs to be done.

We can move the calls to eeh_add_device_tree_early() to the locations where
it's needed and remove it from the generic path. This is preparation for
making the early EEH probe pseries specific.

Reviewed-by: Sam Bobroff 
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/pci-hotplug.c | 2 --
 drivers/pci/hotplug/rpaphp_core.c | 2 ++
 drivers/pci/hotplug/rpaphp_pci.c  | 4 +++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/pci-hotplug.c 
b/arch/powerpc/kernel/pci-hotplug.c
index d6a67f8..bf83f76 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -112,8 +112,6 @@ void pci_hp_add_devices(struct pci_bus *bus)
struct pci_controller *phb;
struct device_node *dn = pci_bus_to_OF_node(bus);
 
-   eeh_add_device_tree_early(PCI_DN(dn));
-
phb = pci_bus_to_host(bus);
 
mode = PCI_PROBE_NORMAL;
diff --git a/drivers/pci/hotplug/rpaphp_core.c 
b/drivers/pci/hotplug/rpaphp_core.c
index e408e40..9c1e43e 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -494,6 +494,8 @@ static int enable_slot(struct hotplug_slot *hotplug_slot)
return retval;
 
if (state == PRESENT) {
+   eeh_add_device_tree_early(PCI_DN(slot->dn));
+
pci_lock_rescan_remove();
pci_hp_add_devices(slot->bus);
pci_unlock_rescan_remove();
diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c
index beca61b..61ebbd8 100644
--- a/drivers/pci/hotplug/rpaphp_pci.c
+++ b/drivers/pci/hotplug/rpaphp_pci.c
@@ -95,8 +95,10 @@ int rpaphp_enable_slot(struct slot *slot)
return -EINVAL;
}
 
-   if (list_empty(&bus->devices))
+   if (list_empty(&bus->devices)) {
+   eeh_add_device_tree_early(PCI_DN(slot->dn));
pci_hp_add_devices(bus);
+   }
 
if (!list_empty(&bus->devices)) {
slot->state = CONFIGURED;
-- 
2.9.5



[PATCH v2 4/6] powerpc/eeh: Remove PHB check in probe

2020-03-05 Thread Oliver O'Halloran
This check for a missing PHB has existing in various forms since the
initial PPC64 port was upstreamed in 2002. The idea seems to be that we
need to guard against creating pci-specific data structures for the non-pci
children of a PCI device tree node (e.g. USB devices). However, we only
create pci_dn structures for DT nodes that correspond to PCI devices so
there's not much point in doing this check in the eeh_probe path.

Reviewed-by: Sam Bobroff 
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9cb3370..a9e4ca7 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1120,7 +1120,6 @@ core_initcall_sync(eeh_init);
  */
 void eeh_add_device_early(struct pci_dn *pdn)
 {
-   struct pci_controller *phb = pdn ? pdn->phb : NULL;
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
 
if (!edev)
@@ -1129,11 +1128,6 @@ void eeh_add_device_early(struct pci_dn *pdn)
if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE))
return;
 
-   /* USB Bus children of PCI devices will not have BUID's */
-   if (NULL == phb ||
-   (eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid))
-   return;
-
eeh_ops->probe(pdn, NULL);
 }
 
-- 
2.9.5



[PATCH v2 5/6] powerpc/eeh: Make early EEH init pseries specific

2020-03-05 Thread Oliver O'Halloran
The eeh_ops->probe() function is called from two different contexts:

1. On pseries, where we set EEH_PROBE_MODE_DEVTREE, it's called in
   eeh_add_device_early() which is supposed to run before we create
   a pci_dev.

2. On PowerNV, where we set EEH_PROBE_MODE_DEV, it's called in
   eeh_device_add_late() which is supposed to run *after* the
   pci_dev is created.

The "early" probe is required because PAPR requires that we perform an RTAS
call to enable EEH support on a device before we start interacting with it
via config space or MMIO. This requirement doesn't exist on PowerNV and
shoehorning two completely separate initialisation paths into a common
interface just results in a convoluted code everywhere.

Additionally the early probe requires the probe function to take an pci_dn
rather than a pci_dev argument. We'd like to make pci_dn a pseries specific
data structure since there's no real requirement for them on PowerNV. To
help both goals move the early probe into the pseries containment zone
so the platform depedence is more explicit.

Reviewed-by: Sam Bobroff 
Signed-off-by: Oliver O'Halloran 
---
v2: s/set set/we set/ in the commit message
---
 arch/powerpc/include/asm/eeh.h   | 14 +++---
 arch/powerpc/kernel/eeh.c| 46 
 arch/powerpc/kernel/of_platform.c|  6 +--
 arch/powerpc/platforms/powernv/eeh-powernv.c |  6 ---
 arch/powerpc/platforms/pseries/eeh_pseries.c | 65 ++--
 arch/powerpc/platforms/pseries/pci_dlpar.c   |  2 +-
 drivers/pci/hotplug/rpadlpar_core.c  |  2 +-
 drivers/pci/hotplug/rpaphp_core.c|  2 +-
 drivers/pci/hotplug/rpaphp_pci.c |  2 +-
 9 files changed, 64 insertions(+), 81 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 5d10781..8580238 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -301,8 +301,6 @@ int __exit eeh_ops_unregister(const char *name);
 int eeh_check_failure(const volatile void __iomem *token);
 int eeh_dev_check_failure(struct eeh_dev *edev);
 void eeh_addr_cache_init(void);
-void eeh_add_device_early(struct pci_dn *);
-void eeh_add_device_tree_early(struct pci_dn *);
 void eeh_add_device_late(struct pci_dev *);
 void eeh_remove_device(struct pci_dev *);
 int eeh_unfreeze_pe(struct eeh_pe *pe);
@@ -358,10 +356,6 @@ static inline int eeh_check_failure(const volatile void 
__iomem *token)
 
 static inline void eeh_addr_cache_init(void) { }
 
-static inline void eeh_add_device_early(struct pci_dn *pdn) { }
-
-static inline void eeh_add_device_tree_early(struct pci_dn *pdn) { }
-
 static inline void eeh_add_device_late(struct pci_dev *dev) { }
 
 static inline void eeh_remove_device(struct pci_dev *dev) { }
@@ -370,6 +364,14 @@ static inline void eeh_remove_device(struct pci_dev *dev) 
{ }
 #define EEH_IO_ERROR_VALUE(size) (-1UL)
 #endif /* CONFIG_EEH */
 
+#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_EEH)
+void pseries_eeh_init_edev(struct pci_dn *pdn);
+void pseries_eeh_init_edev_recursive(struct pci_dn *pdn);
+#else
+static inline void pseries_eeh_add_device_early(struct pci_dn *pdn) { }
+static inline void pseries_eeh_add_device_tree_early(struct pci_dn *pdn) { }
+#endif
+
 #ifdef CONFIG_PPC64
 /*
  * MMIO read/write operations with EEH support.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index a9e4ca7..55d3ef6 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1107,52 +1107,6 @@ static int eeh_init(void)
 core_initcall_sync(eeh_init);
 
 /**
- * eeh_add_device_early - Enable EEH for the indicated device node
- * @pdn: PCI device node for which to set up EEH
- *
- * This routine must be used to perform EEH initialization for PCI
- * devices that were added after system boot (e.g. hotplug, dlpar).
- * This routine must be called before any i/o is performed to the
- * adapter (inluding any config-space i/o).
- * Whether this actually enables EEH or not for this device depends
- * on the CEC architecture, type of the device, on earlier boot
- * command-line arguments & etc.
- */
-void eeh_add_device_early(struct pci_dn *pdn)
-{
-   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
-
-   if (!edev)
-   return;
-
-   if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE))
-   return;
-
-   eeh_ops->probe(pdn, NULL);
-}
-
-/**
- * eeh_add_device_tree_early - Enable EEH for the indicated device
- * @pdn: PCI device node
- *
- * This routine must be used to perform EEH initialization for the
- * indicated PCI device that was added after system boot (e.g.
- * hotplug, dlpar).
- */
-void eeh_add_device_tree_early(struct pci_dn *pdn)
-{
-   struct pci_dn *n;
-
-   if (!pdn)
-   return;
-
-   list_for_each_entry(n, &pdn->child_list, list)
-   eeh_add_device_tree_early(n);
-   eeh_add_d

[PATCH v2 6/6] powerpc/eeh: Rework eeh_ops->probe()

2020-03-05 Thread Oliver O'Halloran
With the EEH early probe now being pseries specific there's no need for
eeh_ops->probe() to take a pci_dn. Instead, we can make it take a pci_dev
and use the probe function to map a pci_dev to an eeh_dev. This allows
the platform to implement it's own method for finding (or creating) an
eeh_dev for a given pci_dev which also removes a use of pci_dn in
generic EEH code.

This patch also renames eeh_device_add_late() to eeh_device_probe(). This
better reflects what it does does and removes the last vestiges of the
early/late EEH probe split.

Reviewed-by: Sam Bobroff 
Signed-off-by: Oliver O'Halloran 
---
v2: Fixed the comment block above eeh_probe_device() to use the new
function name.
---
 arch/powerpc/include/asm/eeh.h   |  6 ++--
 arch/powerpc/kernel/eeh.c| 44 +++-
 arch/powerpc/platforms/powernv/eeh-powernv.c | 30 +--
 arch/powerpc/platforms/pseries/eeh_pseries.c | 23 ++-
 4 files changed, 62 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8580238..964a542 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -215,7 +215,7 @@ enum {
 struct eeh_ops {
char *name;
int (*init)(void);
-   void* (*probe)(struct pci_dn *pdn, void *data);
+   struct eeh_dev *(*probe)(struct pci_dev *pdev);
int (*set_option)(struct eeh_pe *pe, int option);
int (*get_pe_addr)(struct eeh_pe *pe);
int (*get_state)(struct eeh_pe *pe, int *delay);
@@ -301,7 +301,7 @@ int __exit eeh_ops_unregister(const char *name);
 int eeh_check_failure(const volatile void __iomem *token);
 int eeh_dev_check_failure(struct eeh_dev *edev);
 void eeh_addr_cache_init(void);
-void eeh_add_device_late(struct pci_dev *);
+void eeh_probe_device(struct pci_dev *pdev);
 void eeh_remove_device(struct pci_dev *);
 int eeh_unfreeze_pe(struct eeh_pe *pe);
 int eeh_pe_reset_and_recover(struct eeh_pe *pe);
@@ -356,7 +356,7 @@ static inline int eeh_check_failure(const volatile void 
__iomem *token)
 
 static inline void eeh_addr_cache_init(void) { }
 
-static inline void eeh_add_device_late(struct pci_dev *dev) { }
+static inline void eeh_probe_device(struct pci_dev *dev) { }
 
 static inline void eeh_remove_device(struct pci_dev *dev) { }
 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 55d3ef6..7cdcb41 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1107,35 +1107,43 @@ static int eeh_init(void)
 core_initcall_sync(eeh_init);
 
 /**
- * eeh_add_device_late - Perform EEH initialization for the indicated pci 
device
+ * eeh_probe_device() - Perform EEH initialization for the indicated pci device
  * @dev: pci device for which to set up EEH
  *
  * This routine must be used to complete EEH initialization for PCI
  * devices that were added after system boot (e.g. hotplug, dlpar).
  */
-void eeh_add_device_late(struct pci_dev *dev)
+void eeh_probe_device(struct pci_dev *dev)
 {
-   struct pci_dn *pdn;
struct eeh_dev *edev;
 
-   if (!dev)
+   pr_debug("EEH: Adding device %s\n", pci_name(dev));
+
+   /*
+* pci_dev_to_eeh_dev() can only work if eeh_probe_dev() was
+* already called for this device.
+*/
+   if (WARN_ON_ONCE(pci_dev_to_eeh_dev(dev))) {
+   pci_dbg(dev, "Already bound to an eeh_dev!\n");
return;
+   }
 
-   pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
-   edev = pdn_to_eeh_dev(pdn);
-   eeh_edev_dbg(edev, "Adding device\n");
-   if (edev->pdev == dev) {
-   eeh_edev_dbg(edev, "Device already referenced!\n");
+   edev = eeh_ops->probe(dev);
+   if (!edev) {
+   pr_debug("EEH: Adding device failed\n");
return;
}
 
/*
-* The EEH cache might not be removed correctly because of
-* unbalanced kref to the device during unplug time, which
-* relies on pcibios_release_device(). So we have to remove
-* that here explicitly.
+* FIXME: We rely on pcibios_release_device() to remove the
+* existing EEH state. The release function is only called if
+* the pci_dev's refcount drops to zero so if something is
+* keeping a ref to a device (e.g. a filesystem) we need to
+* remove the old EEH state.
+*
+* FIXME: HEY MA, LOOK AT ME, NO LOCKING!
 */
-   if (edev->pdev) {
+   if (edev->pdev && edev->pdev != dev) {
eeh_rmv_from_parent_pe(edev);
eeh_addr_cache_rmv_dev(edev->pdev);
eeh_sysfs_remove_device(edev->pdev);
@@ -1146,17 +1154,11 @@ void eeh_add_device_late(struct pci_dev *dev)
 * into error handler afterwards.
 */
edev->

Re: [PATCHv3 2/2] pseries/scm: buffer pmem's bound addr in dt for kexec kernel

2020-03-12 Thread Oliver O'Halloran
On Wed, Mar 4, 2020 at 7:50 PM Pingfan Liu  wrote:
>
> At present, plpar_hcall(H_SCM_BIND_MEM, ...) takes a very long time, so
> if dumping to fsdax, it will take a very long time.
>
> Take a closer look, during the papr_scm initialization, the only
> configuration is through drc_pmem_bind()-> plpar_hcall(H_SCM_BIND_MEM,
> ...), which helps to set up the bound address.
>
> On pseries, for kexec -l/-p kernel, there is no reset of hardware, and this
> step can be stepped around to save times.  So the pmem bound address can be
> passed to the 2nd kernel through a dynamic added property "bound-addr" in
> dt node 'ibm,pmemory'.
>
> Signed-off-by: Pingfan Liu 
> To: linuxppc-dev@lists.ozlabs.org
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Hari Bathini 
> Cc: Aneesh Kumar K.V 
> Cc: Oliver O'Halloran 
> Cc: Dan Williams 
> Cc: Andrew Donnellan 
> Cc: Christophe Leroy 
> Cc: Rob Herring 
> Cc: Frank Rowand 
> Cc: ke...@lists.infradead.org
> ---
> note: This patch has not been tested since I can not get such a pseries with 
> pmem.
>   Please kindly to give some suggestion, thanks.

There was some qemu patches to implement the Hcall interface floating
around a while ago. I'm not sure they ever made it into upstream qemu
though.

> ---
>  arch/powerpc/platforms/pseries/of_helpers.c |  1 +
>  arch/powerpc/platforms/pseries/papr_scm.c   | 33 
> -
>  drivers/of/base.c   |  1 +
>  3 files changed, 25 insertions(+), 10 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/of_helpers.c 
> b/arch/powerpc/platforms/pseries/of_helpers.c
> index 1022e0f..2c7bab4 100644
> --- a/arch/powerpc/platforms/pseries/of_helpers.c
> +++ b/arch/powerpc/platforms/pseries/of_helpers.c
> @@ -34,6 +34,7 @@ struct property *new_property(const char *name, const int 
> length,
> kfree(new);
> return NULL;
>  }
> +EXPORT_SYMBOL(new_property);
>
>  /**
>   * pseries_of_derive_parent - basically like dirname(1)
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index 0b4467e..54ae903 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -14,6 +14,7 @@
>  #include 
>
>  #include 
> +#include "of_helpers.h"
>
>  #define BIND_ANY_ADDR (~0ul)
>
> @@ -383,7 +384,7 @@ static int papr_scm_probe(struct platform_device *pdev)
>  {
> struct device_node *dn = pdev->dev.of_node;
> u32 drc_index, metadata_size;
> -   u64 blocks, block_size;
> +   u64 blocks, block_size, bound_addr = 0;
> struct papr_scm_priv *p;
> const char *uuid_str;
> u64 uuid[2];
> @@ -440,17 +441,29 @@ static int papr_scm_probe(struct platform_device *pdev)
> p->metadata_size = metadata_size;
> p->pdev = pdev;
>
> -   /* request the hypervisor to bind this region to somewhere in memory 
> */
> -   rc = drc_pmem_bind(p);
> +   of_property_read_u64(dn, "bound-addr", &bound_addr);
> +   if (bound_addr) {
> +   p->bound_addr = bound_addr;
> +   } else {
> +   struct property *property;
> +   u64 big;
>
> -   /* If phyp says drc memory still bound then force unbound and retry */
> -   if (rc == H_OVERLAP)
> -   rc = drc_pmem_query_n_bind(p);
> +   /* request the hypervisor to bind this region to somewhere in 
> memory */
> +   rc = drc_pmem_bind(p);
>
> -   if (rc != H_SUCCESS) {
> -   dev_err(&p->pdev->dev, "bind err: %d\n", rc);
> -   rc = -ENXIO;
> -   goto err;
> +   /* If phyp says drc memory still bound then force unbound and 
> retry */
> +   if (rc == H_OVERLAP)
> +   rc = drc_pmem_query_n_bind(p);
> +
> +   if (rc != H_SUCCESS) {
> +   dev_err(&p->pdev->dev, "bind err: %d\n", rc);
> +   rc = -ENXIO;
> +   goto err;
> +   }
> +   big = cpu_to_be64(p->bound_addr);
> +   property = new_property("bound-addr", sizeof(u64), (const 
> unsigned char *)&big,
> +   NULL);

That should probably be "linux,bound-addr"

The other thing that stands out to me is that you aren't removing the
property when the region is unbound. As a general rule I'd prefer we
didn't hack the DT at runtime, but if we are going to then we s

Re: [PATCH v8 04/14] powerpc/vas: Alloc and setup IRQ and trigger port address

2020-03-23 Thread Oliver O'Halloran
On Mon, Mar 23, 2020 at 8:28 PM Cédric Le Goater  wrote:
>
> On 3/23/20 10:06 AM, Cédric Le Goater wrote:
> > On 3/19/20 7:14 AM, Haren Myneni wrote:
> >>
> >> Alloc IRQ and get trigger port address for each VAS instance. Kernel
> >> register this IRQ per VAS instance and sets this port for each send
> >> window. NX interrupts the kernel when it sees page fault.
> >
> > I don't understand why this is not done by the OPAL driver for each VAS
> > of the system. Is the VAS unit very different from OpenCAPI regarding
> > the fault ?
>
> I checked the previous patchsets and I see that v3 was more like I expected
> it: one interrupt for faults allocated by the skiboot driver and exposed
> in the DT.
>
> What made you change your mind ?

>From init_vas_inst() in arch/powerpc/platforms/powernv/vas.c:

if (pdev->num_resources != 4) {
pr_err("Unexpected DT configuration for [%s, %d]\n",
pdev->name, vasid);
return -ENODEV;
}

This code should never have been written, but here we are. Due to the
above adding an interrupt in the DT makes the driver unable to bind on
older kernels. In an older version of the patches (don't think it was
posted) Haren was using a non-standard interrupt property and we could
work around the problem by going back to that.

However, we already have the OPAL calls for allocating / freeing
hardware interrupt numbers so why not do that? If we ever want to take
advantage of the job completion interrupts we'd want to have the
ability to allocate them since the completion interrupts are
per-window rather than per-VAS.

> This version is hijacking the lowlevel routines of the XIVE irqchip which
> is not the best approach. OCXL is doing that because it needs to allocate
> interrupts for the user space processes using the AFU and we should rework
> that part.

What'd you have in mind for the reworking the oxcl interrupt
allocation? I didn't find it that objectionable since it's more or
less the same as what happens when allocating IPIs.

> However, the translation fault interrupt is allocated by skiboot.
>
> Sorry for the noise, I would like to understand more how this works. I also
> have passthrough in mind.
>
> C.


Re: [PATCH v4 00/25] Add support for OpenCAPI Persistent Memory devices

2020-04-01 Thread Oliver O'Halloran
On Thu, Apr 2, 2020 at 2:42 PM Michael Ellerman  wrote:
>
> "Alastair D'Silva"  writes:
> >> -Original Message-
> >> From: Dan Williams 
> >>
> >> On Sun, Mar 29, 2020 at 10:23 PM Alastair D'Silva 
> >> wrote:
> >> >
> >> > *snip*
> >> Are OPAL calls similar to ACPI DSMs? I.e. methods for the OS to invoke
> >> platform firmware services? What's Skiboot?
> >>
> >
> > Yes, OPAL is the interface to firmware for POWER. Skiboot is the 
> > open-source (and only) implementation of OPAL.
>
>   https://github.com/open-power/skiboot
>
> In particular the tokens for calls are defined here:
>
>   https://github.com/open-power/skiboot/blob/master/include/opal-api.h#L220
>
> And you can grep for the token to find the implementation:
>
>   https://github.com/open-power/skiboot/blob/master/hw/npu2-opencapi.c#L2328

I'm not sure I'd encourage anyone to read npu2-opencapi.c. I find it
hard enough to follow even with access to the workbooks.

There's an OPAL call API reference here:
http://open-power.github.io/skiboot/doc/opal-api/index.html

Oliver


Re: [PATCH v3 1/1] powerpc/kernel: Enables memory hot-remove after reboot on pseries guests

2020-04-02 Thread Oliver O'Halloran
On Fri, Apr 3, 2020 at 6:55 AM Leonardo Bras  wrote:
>
> While providing guests, it's desirable to resize it's memory on demand.
>
> By now, it's possible to do so by creating a guest with a small base
> memory, hot-plugging all the rest, and using 'movable_node' kernel
> command-line parameter, which puts all hot-plugged memory in
> ZONE_MOVABLE, allowing it to be removed whenever needed.
>
> But there is an issue regarding guest reboot:
> If memory is hot-plugged, and then the guest is rebooted, all hot-plugged
> memory goes to ZONE_NORMAL, which offers no guaranteed hot-removal.
> It usually prevents this memory to be hot-removed from the guest.
>
> It's possible to use device-tree information to fix that behavior, as
> it stores flags for LMB ranges on ibm,dynamic-memory-vN.
> It involves marking each memblock with the correct flags as hotpluggable
> memory, which mm/memblock.c puts in ZONE_MOVABLE during boot if
> 'movable_node' is passed.

I don't really understand why the flag is needed at all. According to
PAPR any memory provided by dynamic reconfiguration can be hot-removed
so why aren't we treating all DR memory as hot removable? The only
memory guaranteed to be there 100% of the time is what's in the
/memory@0 node since that's supposed to cover the real mode area.


Re: [PATCH v3 1/1] powerpc/kernel: Enables memory hot-remove after reboot on pseries guests

2020-04-02 Thread Oliver O'Halloran
On Fri, Apr 3, 2020 at 10:07 AM Leonardo Bras  wrote:
>
> Hello Oliver, thank you for the feedback.
> Comments inline:
>
> On Fri, 2020-04-03 at 09:46 +1100, Oliver O'Halloran wrote:
> >
> > I don't really understand why the flag is needed at all. According to
> > PAPR any memory provided by dynamic reconfiguration can be hot-removed
> > so why aren't we treating all DR memory as hot removable? The only
> > memory guaranteed to be there 100% of the time is what's in the
> > /memory@0 node since that's supposed to cover the real mode area.
>
> All LMBs are listed in DR memory, even the base memory.
>
> The v1 of the patch would work this way, as qemu would configure it's
> DR memory with (DRC_INVALID | RESERVED) flags and the hot-added memory
> with (ASSIGNED) flag. Looking for assigned flag would be enough.
>
> But as of today, PowerVM doesn't seem to work that way.
> When you boot a PowerVM virtual machine with Linux, all memory is added
> with the same flags (ASSIGNED).
>
> To create a solution that doesn't break PowerVM, this new flag was made
> necessary.

I'm still not convinced it's necessary. Why not check memory@0 and use
the size as a clip level? Any memory above that level gets marked as
hotpluggable and anything below doesn't. Seems to me that would work
on all current platforms, so what am I missing here?

>
> Best regards,
> Leonardo Bras


Re: [PATCH 1/4] powerpc/eeh: fix pseries_eeh_configure_bridge()

2020-04-02 Thread Oliver O'Halloran
On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> If a device is hot unplgged during EEH recovery, it's possible for the
> RTAS call to ibm,configure-pe in pseries_eeh_configure() to return
> parameter error (-3), however negative return values are not checked
> for and this leads to an infinite loop.
> 
> Fix this by correctly bailing out on negative values.
> 

This should probably be a standalone patch. Looks fine otherwise.

Reviewed-by: Oliver O'Halloran 


> Signed-off-by: Sam Bobroff 
> ---
>  arch/powerpc/platforms/pseries/eeh_pseries.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
> b/arch/powerpc/platforms/pseries/eeh_pseries.c
> index 893ba3f562c4..c4ef03bec0de 100644
> --- a/arch/powerpc/platforms/pseries/eeh_pseries.c
> +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
> @@ -605,7 +605,7 @@ static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
>   config_addr, BUID_HI(pe->phb->buid),
>   BUID_LO(pe->phb->buid));
>  
> - if (!ret)
> + if (ret <= 0)
>   return ret;
>  
>   /*



Re: [PATCH 2/4] powerpc/eeh: Release EEH device state synchronously

2020-04-02 Thread Oliver O'Halloran
On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> EEH device state is currently removed (by eeh_remove_device()) during
> the device release handler, which is invoked as the device's reference
> count drops to zero. This may take some time, or forever, as other
> threads may hold references.
> 
> However, the PCI device state is released synchronously by
> pci_stop_and_remove_bus_device(). This mismatch causes problems, for
> example the device may be re-discovered as a new device before the
> release handler has been called, leaving the PCI and EEH state
> mismatched.
> 
> So instead, call eeh_remove_device() from the bus device removal
> handlers, which are called synchronously in the removal path.
> 
> Signed-off-by: Sam Bobroff 
> ---
>  arch/powerpc/kernel/eeh.c | 26 ++
>  arch/powerpc/kernel/pci-hotplug.c |  2 --
>  2 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 17cb3e9b5697..c36c5a7db5ca 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -1106,6 +1106,32 @@ static int eeh_init(void)
>  
>  core_initcall_sync(eeh_init);
>  
> +static int eeh_device_notifier(struct notifier_block *nb,
> +unsigned long action, void *data)
> +{
> + struct device *dev = data;
> +
> + switch (action) {
> + case BUS_NOTIFY_DEL_DEVICE:
> + eeh_remove_device(to_pci_dev(dev));
> + break;
> + default:
> + break;
> +     }

A comment briefly explaining why we're not doing anything in the add
case might be nice.

Reviewed-by: Oliver O'Halloran 

> + return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block eeh_device_nb = {
> + .notifier_call = eeh_device_notifier,
> +};
> +
> +static __init int eeh_set_bus_notifier(void)
> +{
> + bus_register_notifier(&pci_bus_type, &eeh_device_nb);
> + return 0;
> +}
> +arch_initcall(eeh_set_bus_notifier);
> +
>  /**
>   * eeh_add_device_early - Enable EEH for the indicated device node
>   * @pdn: PCI device node for which to set up EEH
> diff --git a/arch/powerpc/kernel/pci-hotplug.c 
> b/arch/powerpc/kernel/pci-hotplug.c
> index d6a67f814983..28e9aa274f64 100644
> --- a/arch/powerpc/kernel/pci-hotplug.c
> +++ b/arch/powerpc/kernel/pci-hotplug.c
> @@ -57,8 +57,6 @@ void pcibios_release_device(struct pci_dev *dev)
>   struct pci_controller *phb = pci_bus_to_host(dev->bus);
>   struct pci_dn *pdn = pci_get_pdn(dev);
>  
> - eeh_remove_device(dev);
> -
>   if (phb->controller_ops.release_device)
>   phb->controller_ops.release_device(dev);
>  



Re: [PATCH 4/4] powerpc/eeh: Clean up edev cleanup for VFs

2020-04-02 Thread Oliver O'Halloran
On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> Because the bus notifier calls eeh_rmv_from_parent_pe() (via
> eeh_remove_device()) when a VF is removed, the call in
> remove_sriov_vf_pdns() is redundant.

eeh_rmv_from_parent_pe() won't actually remove the device if the
recovering flag is set on the PE. Are you sure we're not introducing a
race here?



Re: [PATCH 3/4] powerpc/eeh: Remove workaround from eeh_add_device_late()

2020-04-02 Thread Oliver O'Halloran
On Mon, 2020-03-30 at 15:56 +1100, Sam Bobroff wrote:
> When EEH device state was released asynchronously by the device
> release handler, it was possible for an outstanding reference to
> prevent it's release and it was necessary to work around that if a
> device was re-discovered at the same PCI location.

I think this is a bit misleading. The main situation where you'll hit
this hack is when recovering a device with a driver that doesn't
implement the error handling callbacks. In that case the device is
removed, reset, then re-probed by the PCI core, but we assume it's the
same physical device so the eeh_device state remains active.

If you actually changed the underlying device I suspect something bad
would happen.

> Now that the state is released synchronously that is no longer
> possible and the workaround is no longer necessary.

You could probably fold this into the previous patch, but eh. You could
probably fold this into the previous patch, but eh.

> Signed-off-by: Sam Bobroff 
> ---
>  arch/powerpc/kernel/eeh.c | 23 +--
>  1 file changed, 1 insertion(+), 22 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index c36c5a7db5ca..12c248a16527 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -1206,28 +1206,7 @@ void eeh_add_device_late(struct pci_dev *dev)
>   eeh_edev_dbg(edev, "Device already referenced!\n");
>   return;
>   }
> -
> - /*
> -  * The EEH cache might not be removed correctly because of
> -  * unbalanced kref to the device during unplug time, which
> -  * relies on pcibios_release_device(). So we have to remove
> -  * that here explicitly.
> -  */
> - if (edev->pdev) {
> - eeh_rmv_from_parent_pe(edev);
> - eeh_addr_cache_rmv_dev(edev->pdev);
> - eeh_sysfs_remove_device(edev->pdev);
> -
> - /*
> -  * We definitely should have the PCI device removed
> -  * though it wasn't correctly. So we needn't call
> -  * into error handler afterwards.
> -  */
> - edev->mode |= EEH_DEV_NO_HANDLER;
> -
> - edev->pdev = NULL;
> - dev->dev.archdata.edev = NULL;
> - }
> + WARN_ON_ONCE(edev->pdev);
>  
>   if (eeh_has_flag(EEH_PROBE_MODE_DEV))
>   eeh_ops->probe(pdn, NULL);



[PATCH 1/7] powerpc/powernv/npu: Clean up compound table group initialisation

2020-04-05 Thread Oliver O'Halloran
Re-work the control flow a bit so what's going on is a little clearer.
This also ensures the table_group is only initialised once in the P9
case. This shouldn't be a functional change since all the GPU PCI
devices should have the same table_group configuration, but it does
look strange.

Cc: Alexey Kardashevskiy 
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 46 +++-
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index b95b9e3c4c98..de617549c9a3 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -427,7 +427,7 @@ static void pnv_comp_attach_table_group(struct npu_comp 
*npucomp,
 
 struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
 {
-   struct iommu_table_group *table_group;
+   struct iommu_table_group *compound_group;
struct npu_comp *npucomp;
struct pci_dev *gpdev = NULL;
struct pci_controller *hose;
@@ -446,36 +446,32 @@ struct iommu_table_group 
*pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
hose = pci_bus_to_host(npdev->bus);
 
if (hose->npu) {
-   table_group = &hose->npu->npucomp.table_group;
-
-   if (!table_group->group) {
-   table_group->ops = &pnv_npu_peers_ops;
-   iommu_register_group(table_group,
-   hose->global_number,
-   pe->pe_number);
-   }
+   /* P9 case: compound group is per-NPU (all gpus, all links) */
+   npucomp = &hose->npu->npucomp;
} else {
-   /* Create a group for 1 GPU and attached NPUs for POWER8 */
-   pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL);
-   table_group = &pe->npucomp->table_group;
-   table_group->ops = &pnv_npu_peers_ops;
-   iommu_register_group(table_group, hose->global_number,
-   pe->pe_number);
+   /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */
+   npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL);
}
 
-   /* Steal capabilities from a GPU PE */
-   table_group->max_dynamic_windows_supported =
-   pe->table_group.max_dynamic_windows_supported;
-   table_group->tce32_start = pe->table_group.tce32_start;
-   table_group->tce32_size = pe->table_group.tce32_size;
-   table_group->max_levels = pe->table_group.max_levels;
-   if (!table_group->pgsizes)
-   table_group->pgsizes = pe->table_group.pgsizes;
+   compound_group = &npucomp->table_group;
+   if (!compound_group->group) {
+   compound_group->ops = &pnv_npu_peers_ops;
+   iommu_register_group(compound_group, hose->global_number,
+   pe->pe_number);
+
+   /* Steal capabilities from a GPU PE */
+   compound_group->max_dynamic_windows_supported =
+   pe->table_group.max_dynamic_windows_supported;
+   compound_group->tce32_start = pe->table_group.tce32_start;
+   compound_group->tce32_size = pe->table_group.tce32_size;
+   compound_group->max_levels = pe->table_group.max_levels;
+   if (!compound_group->pgsizes)
+   compound_group->pgsizes = pe->table_group.pgsizes;
+   }
 
-   npucomp = container_of(table_group, struct npu_comp, table_group);
pnv_comp_attach_table_group(npucomp, pe);
 
-   return table_group;
+   return compound_group;
 }
 
 struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
-- 
2.21.1



Make PowerNV IOMMU group setup saner (and fix it for hotpug)

2020-04-05 Thread Oliver O'Halloran
Currently on PowerNV the IOMMU group of a device is initialised in
boot-time fixup which runs after devices are probed. Because this is
only run at boot time hotplugged devices do not recieve an iommu group
assignment which prevents them from being passed through to a guest.

This series fixes that by moving the point where IOMMU groups are
registered to when we configure DMA for a PE, and moves the point where
we add a device to the PE's IOMMU group into the per-device DMA setup
callback for IODA phbs (pnv_pci_ioda_dma_dev_setup()). This change means
that we'll do group setup for hotplugged devices and that we can remove
the hack we have for VFs which are currently added to their group
via a bus notifier.

With this change there's no longer any per-device setup that needs to
run in a fixup for ordinary PCI devices. The exception is, as per usual,
NVLink devices. For those the GPU and any of it's NVLink devices need
to be in a "compound" IOMMU group which keeps the DMA address spaces
of each device in sync with it's attached devices. As a result that
setup can only be done when both the NVLink devices and the GPU device
has been probed, so that setup is still done in the fixup. Sucks, but
it's still an improvement.

Boot tested on a witherspoon with 6xGPUs and it didn't crash so it must
be good.




[PATCH 2/7] powerpc/powernv/iov: Don't add VFs to iommu group during PE config

2020-04-05 Thread Oliver O'Halloran
In pnv_ioda_setup_vf_PE() we register an iommu group for the VF PE
then call pnv_ioda_setup_bus_iommu_group() to add devices to that group.
However, this function is called before the VFs are scanned so there's
no devices to add.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 57d3a6af1d52..2c340504fa77 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1622,7 +1622,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, 
u16 num_vfs)
 #ifdef CONFIG_IOMMU_API
iommu_register_group(&pe->table_group,
pe->phb->hose->global_number, pe->pe_number);
-   pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
 #endif
}
 }
-- 
2.21.1



[PATCH 3/7] powerpc/powernv/pci: Register iommu group at PE DMA setup

2020-04-05 Thread Oliver O'Halloran
Move the registration of IOMMU groups out of the post-phb init fixup and
into when we configure DMA for a PE. For most devices this doesn't
result in any functional changes, but for NVLink attached GPUs it
requires a bit of care. When the GPU is probed an IOMMU group would be
created for the PE that contains it. We need to ensure that group is
removed before we add the PE to the compound group that's used to keep
the translations see by the PCIe and NVLink buses the same.

No functional changes. Probably.

Cc: Alexey Kardashevskiy 
Cc: Reza Arbab 
Cc: Alistair Popple 
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/npu-dma.c  |  9 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++--
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index de617549c9a3..4fbbdfa8b327 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -469,6 +469,15 @@ struct iommu_table_group 
*pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
compound_group->pgsizes = pe->table_group.pgsizes;
}
 
+   /*
+   * I'm not sure this is strictly required, but it's probably a good idea
+   * since the table_group for the PE is going to be attached to the
+   * compound table group. If we leave the PE's iommu group active then
+   * we might have the same table_group being modifiable via two sepeate
+   * iommu groups.
+   */
+   iommu_group_put(pe->table_group.group);
+
pnv_comp_attach_table_group(npucomp, pe);
 
return compound_group;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2c340504fa77..cf0aaef1b8fa 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1619,10 +1619,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, 
u16 num_vfs)
}
 
pnv_pci_ioda2_setup_dma_pe(phb, pe);
-#ifdef CONFIG_IOMMU_API
-   iommu_register_group(&pe->table_group,
-   pe->phb->hose->global_number, pe->pe_number);
-#endif
}
 }
 
@@ -2661,9 +2657,6 @@ static void pnv_pci_ioda_setup_iommu_api(void)
continue;
 
table_group = &pe->table_group;
-   iommu_register_group(&pe->table_group,
-   pe->phb->hose->global_number,
-   pe->pe_number);
}
pnv_ioda_setup_bus_iommu_group(pe, table_group,
pe->pbus);
@@ -2748,14 +2741,17 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
IOMMU_TABLE_GROUP_MAX_TABLES;
pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
-#ifdef CONFIG_IOMMU_API
-   pe->table_group.ops = &pnv_pci_ioda2_ops;
-#endif
 
rc = pnv_pci_ioda2_setup_default_config(pe);
if (rc)
return;
 
+#ifdef CONFIG_IOMMU_API
+   pe->table_group.ops = &pnv_pci_ioda2_ops;
+   iommu_register_group(&pe->table_group, phb->hose->global_number,
+pe->pe_number);
+#endif
+
if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe->pbus);
 }
-- 
2.21.1



[PATCH 4/7] powerpc/powernv/pci: Add device to iommu group during dma_dev_setup()

2020-04-05 Thread Oliver O'Halloran
Historically adding devices to their respective iommu group has been
handled by the post-init phb fixup for most devices. This was done
because:

1) The IOMMU group is tied to the PE (usually) so we can only setup the
   iommu groups after we've done resource allocation since BAR location
   determines the device's PE, and:
2) The sysfs directory for the pci_dev needs to be available since
   iommu_add_device() wants to add an attribute for the iommu group.

However, since commit 30d87ef8b38d ("powerpc/pci: Fix
pcibios_setup_device() ordering") both conditions are met when
hose->ops->dma_dev_setup() is called so there's no real need to do
this in the fixup.

Moving the call to iommu_add_device() into pnv_pci_ioda_dma_setup_dev()
is a nice cleanup since it puts all the per-device IOMMU setup into one
place. It also results in all (non-nvlink) devices getting their iommu
group via a common path rather than relying on the bus notifier hack
in pnv_tce_iommu_bus_notifier() to handle the adding VFs and
hotplugged devices to their group.

Cc: Alexey Kardashevskiy 
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/npu-dma.c  |  8 
 arch/powerpc/platforms/powernv/pci-ioda.c | 47 +++
 arch/powerpc/platforms/powernv/pci.c  | 20 --
 3 files changed, 21 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 4fbbdfa8b327..df27b8d7e78f 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -469,6 +469,12 @@ struct iommu_table_group 
*pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
compound_group->pgsizes = pe->table_group.pgsizes;
}
 
+   /*
+* The gpu would have been added to the iommu group that's created
+* for the PE. Pull it out now.
+*/
+   iommu_del_device(&gpdev->dev);
+
/*
* I'm not sure this is strictly required, but it's probably a good idea
* since the table_group for the PE is going to be attached to the
@@ -478,7 +484,9 @@ struct iommu_table_group 
*pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
*/
iommu_group_put(pe->table_group.group);
 
+   /* now put the GPU into the compound group */
pnv_comp_attach_table_group(npucomp, pe);
+   iommu_add_device(compound_group, &gpdev->dev);
 
return compound_group;
 }
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index cf0aaef1b8fa..9198b7882b57 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1774,12 +1774,10 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev 
*pdev)
WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
-   /*
-* Note: iommu_add_device() will fail here as
-* for physical PE: the device is already added by now;
-* for virtual PE: sysfs entries are not ready yet and
-* tce_iommu_bus_notifier will add the device to a group later.
-*/
+
+   /* PEs with a DMA weight of zero won't have a group */
+   if (pe->table_group.group)
+   iommu_add_device(&pe->table_group, &pdev->dev);
 }
 
 /*
@@ -2628,39 +2626,20 @@ static void pnv_pci_ioda_setup_iommu_api(void)
struct pnv_ioda_pe *pe;
 
/*
-* There are 4 types of PEs:
-* - PNV_IODA_PE_BUS: a downstream port with an adapter,
-*   created from pnv_pci_setup_bridge();
-* - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
-*   created from pnv_pci_setup_bridge();
-* - PNV_IODA_PE_VF: a SRIOV virtual function,
-*   created from pnv_pcibios_sriov_enable();
-* - PNV_IODA_PE_DEV: an NPU or OCAPI device,
-*   created from pnv_pci_ioda_fixup().
+* For non-nvlink devices the IOMMU group is registered when the PE is
+* configured and devices are added to the group when the per-device
+* DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
+* only initialise for "normal" IODA PHBs.
 *
-* Normally a PE is represented by an IOMMU group, however for
-* devices with side channels the groups need to be more strict.
+* For NVLink devices we need to ensure the NVLinks and the GPU end up
+* in the same IOMMU group, so that's handled here.
 */
list_for_each_entry(hose, &hose_list, list_node) {
phb = hose->private_data;
 
-   if (phb->type == PNV_PHB_NPU_NVLINK ||
-   phb->type == PNV_PHB_NPU_

[PATCH 5/7] powerpc/powernv/pci: Delete old iommu recursive iommu setup

2020-04-05 Thread Oliver O'Halloran
No longer used.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 32 ---
 1 file changed, 32 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9198b7882b57..8b45b8e561e9 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1550,11 +1550,6 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe);
-#ifdef CONFIG_IOMMU_API
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
-   struct iommu_table_group *table_group, struct pci_bus *bus);
-
-#endif
 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 {
struct pci_bus*bus;
@@ -2590,33 +2585,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
.release_ownership = pnv_ioda2_release_ownership,
 };
 
-static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
-   struct iommu_table_group *table_group,
-   struct pci_bus *bus)
-{
-   struct pci_dev *dev;
-
-   list_for_each_entry(dev, &bus->devices, bus_list) {
-   iommu_add_device(table_group, &dev->dev);
-
-   if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
-   pnv_ioda_setup_bus_iommu_group_add_devices(pe,
-   table_group, dev->subordinate);
-   }
-}
-
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
-   struct iommu_table_group *table_group, struct pci_bus *bus)
-{
-
-   if (pe->flags & PNV_IODA_PE_DEV)
-   iommu_add_device(table_group, &pe->pdev->dev);
-
-   if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
-   pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
-   bus);
-}
-
 static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
 
 static void pnv_pci_ioda_setup_iommu_api(void)
-- 
2.21.1



[PATCH 6/7] powerpc/powernv/pci: Move tce size parsing to pci-ioda-tce.c

2020-04-05 Thread Oliver O'Halloran
Move it in with the rest of the TCE wrangling rather than carting around
a static prototype in pci-ioda.c

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 28 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 ---
 arch/powerpc/platforms/powernv/pci.h  |  2 ++
 3 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c 
b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 5dc6847d5f4c..f923359d8afc 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -17,6 +17,34 @@
 #include 
 #include "pci.h"
 
+unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
+{
+   struct pci_controller *hose = phb->hose;
+   struct device_node *dn = hose->dn;
+   unsigned long mask = 0;
+   int i, rc, count;
+   u32 val;
+
+   count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
+   if (count <= 0) {
+   mask = SZ_4K | SZ_64K;
+   /* Add 16M for POWER8 by default */
+   if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+   !cpu_has_feature(CPU_FTR_ARCH_300))
+   mask |= SZ_16M | SZ_256M;
+   return mask;
+   }
+
+   for (i = 0; i < count; i++) {
+   rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
+   i, &val);
+   if (rc == 0)
+   mask |= 1ULL << val;
+   }
+
+   return mask;
+}
+
 void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8b45b8e561e9..c020ade3a846 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2585,8 +2585,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
.release_ownership = pnv_ioda2_release_ownership,
 };
 
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
-
 static void pnv_pci_ioda_setup_iommu_api(void)
 {
struct pci_controller *hose;
@@ -2638,34 +2636,6 @@ static void pnv_pci_ioda_setup_iommu_api(void)
 static void pnv_pci_ioda_setup_iommu_api(void) { };
 #endif
 
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
-{
-   struct pci_controller *hose = phb->hose;
-   struct device_node *dn = hose->dn;
-   unsigned long mask = 0;
-   int i, rc, count;
-   u32 val;
-
-   count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
-   if (count <= 0) {
-   mask = SZ_4K | SZ_64K;
-   /* Add 16M for POWER8 by default */
-   if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
-   !cpu_has_feature(CPU_FTR_ARCH_300))
-   mask |= SZ_16M | SZ_256M;
-   return mask;
-   }
-
-   for (i = 0; i < count; i++) {
-   rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
-   i, &val);
-   if (rc == 0)
-   mask |= 1ULL << val;
-   }
-
-   return mask;
-}
-
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe)
 {
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index d3bbdeab3a32..0c5845a1f05d 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -244,4 +244,6 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table 
*tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift);
 
+extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
+
 #endif /* __POWERNV_PCI_H */
-- 
2.21.1



[PATCH 7/7] powerpc/powernv/npu: Move IOMMU group setup into npu-dma.c

2020-04-05 Thread Oliver O'Halloran
The NVlink IOMMU group setup is only relevant to NVLink devices so move
it into the NPU containment zone. This let us remove some prototypes in
pci.h and staticfy some function definitions.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/npu-dma.c  | 54 +++-
 arch/powerpc/platforms/powernv/pci-ioda.c | 60 +++
 arch/powerpc/platforms/powernv/pci.h  |  6 +--
 3 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index df27b8d7e78f..abeaa533b976 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -15,6 +15,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 #include "pci.h"
@@ -425,7 +426,8 @@ static void pnv_comp_attach_table_group(struct npu_comp 
*npucomp,
++npucomp->pe_num;
 }
 
-struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
+static struct iommu_table_group *
+   pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
 {
struct iommu_table_group *compound_group;
struct npu_comp *npucomp;
@@ -491,7 +493,7 @@ struct iommu_table_group 
*pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
return compound_group;
 }
 
-struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
+static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe 
*pe)
 {
struct iommu_table_group *table_group;
struct npu_comp *npucomp;
@@ -534,6 +536,54 @@ struct iommu_table_group *pnv_npu_compound_attach(struct 
pnv_ioda_pe *pe)
 
return table_group;
 }
+
+void pnv_pci_npu_setup_iommu_groups(void)
+{
+   struct pci_controller *hose;
+   struct pnv_phb *phb;
+   struct pnv_ioda_pe *pe;
+
+   /*
+* For non-nvlink devices the IOMMU group is registered when the PE is
+* configured and devices are added to the group when the per-device
+* DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
+* only initialise for "normal" IODA PHBs.
+*
+* For NVLink devices we need to ensure the NVLinks and the GPU end up
+* in the same IOMMU group, so that's handled here.
+*/
+   list_for_each_entry(hose, &hose_list, list_node) {
+   phb = hose->private_data;
+
+   if (phb->type == PNV_PHB_IODA2)
+   list_for_each_entry(pe, &phb->ioda.pe_list, list)
+   pnv_try_setup_npu_table_group(pe);
+   }
+
+   /*
+* Now we have all PHBs discovered, time to add NPU devices to
+* the corresponding IOMMU groups.
+*/
+   list_for_each_entry(hose, &hose_list, list_node) {
+   unsigned long  pgsizes;
+
+   phb = hose->private_data;
+
+   if (phb->type != PNV_PHB_NPU_NVLINK)
+   continue;
+
+   pgsizes = pnv_ioda_parse_tce_sizes(phb);
+   list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+   /*
+* IODA2 bridges get this set up from
+* pci_controller_ops::setup_bridge but NPU bridges
+* do not have this hook defined so we do it here.
+*/
+   pe->table_group.pgsizes = pgsizes;
+   pnv_npu_compound_attach(pe);
+   }
+   }
+}
 #endif /* CONFIG_IOMMU_API */
 
 int pnv_npu2_init(struct pci_controller *hose)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c020ade3a846..dba0c2c09f61 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1288,7 +1288,7 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
pnv_ioda_setup_npu_PE(pdev);
 }
 
-static void pnv_pci_ioda_setup_PEs(void)
+static void pnv_pci_ioda_setup_nvlink(void)
 {
struct pci_controller *hose;
struct pnv_phb *phb;
@@ -1312,6 +1312,11 @@ static void pnv_pci_ioda_setup_PEs(void)
list_for_each_entry(pe, &phb->ioda.pe_list, list)
pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
}
+
+#ifdef CONFIG_IOMMU_API
+   /* setup iommu groups so we can do nvlink pass-thru */
+   pnv_pci_npu_setup_iommu_groups();
+#endif
 }
 
 #ifdef CONFIG_PCI_IOV
@@ -2584,56 +2589,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
.take_ownership = pnv_ioda2_take_ownership,
.release_ownership = pnv_ioda2_release_ownership,
 };
-
-static void pnv_pci_ioda_setup_iommu_api(void)
-{
-   struct pci_controller *hose;
-   struct pnv_phb *phb;
-   struct pnv_ioda_pe *pe;
-
-   /*
-* For non-nvlink devices the IOMMU group is registered when t

Re: [PATCH v3] powerpc/powernv: add NULL check after kzalloc in opal_add_one_export

2020-04-06 Thread Oliver O'Halloran
On Mon, Apr 6, 2020 at 11:15 AM Qiujun Huang  wrote:
>
> On Mon, Apr 6, 2020 at 3:06 AM Markus Elfring  wrote:
> >
> > > Here needs a NULL check.
> quite obvious?
> >
> > I find this change description questionable
> > (despite of a reasonable patch subject).
> >
> >
> > > Issue found by coccinelle.
> >
> > Would an information like “Generated by: 
> > scripts/coccinelle/null/kmerr.cocci”
> > be nicer?
> Yeah, but I think It was enough.

I didn't know we had that script in the kernel tree so I think it's a
good to mention that you used it. It might even help idiots like me
who write this sort of bug.

> > Will a patch change log be helpful here?
> I realized I should write some change log, and the change log was meaningless.
> So I left it blank.

The changelog is fine IMO. The point of a changelog is to tell a
reader doing git archeology why a change happened and this is
sufficent for that.

Reviewed-by: Oliver O'Halloran 


[PATCH] powerpc/boot: Fix the initrd being overwritten under qemu

2019-10-22 Thread Oliver O'Halloran
When booting under OF the zImage expects the initrd address and size to be
passed to it using registers r3 and r4. SLOF (guest firmware used by QEMU)
currently doesn't do this so the zImage is not aware of the initrd
location.  This can result in initrd corruption either though the zImage
extracting the vmlinux over the initrd, or by the vmlinux overwriting the
initrd when relocating itself.

QEMU does put the linux,initrd-start and linux,initrd-end properties into
the devicetree to vmlinux to find the initrd. We can work around the SLOF
bug by also looking those properties in the zImage.

Cc: sta...@vger.kernel.org
Cc: Alexey Kardashevskiy 
Signed-off-by: Oliver O'Halloran 
---
First noticed here: 
https://unix.stackexchange.com/questions/547023/linux-kernel-on-ppc64le-vmlinux-equivalent-in-arch-powerpc-boot
---
 arch/powerpc/boot/devtree.c | 21 +
 arch/powerpc/boot/main.c|  7 +++
 arch/powerpc/boot/of.h  | 16 
 arch/powerpc/boot/ops.h |  1 +
 arch/powerpc/boot/swab.h| 17 +
 5 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/boot/devtree.c b/arch/powerpc/boot/devtree.c
index 5d91036..ac5c26b 100644
--- a/arch/powerpc/boot/devtree.c
+++ b/arch/powerpc/boot/devtree.c
@@ -13,6 +13,7 @@
 #include "string.h"
 #include "stdio.h"
 #include "ops.h"
+#include "swab.h"
 
 void dt_fixup_memory(u64 start, u64 size)
 {
@@ -318,6 +319,26 @@ int dt_xlate_reg(void *node, int res, unsigned long *addr, 
unsigned long *size)
return dt_xlate(node, res, reglen, addr, size);
 }
 
+int dt_read_addr(void *node, const char *prop, unsigned long *out_addr)
+{
+   int reglen;
+
+   *out_addr = 0;
+
+   reglen = getprop(node, prop, prop_buf, sizeof(prop_buf)) / 4;
+   if (reglen == 2) {
+   u64 v0 = be32_to_cpu(prop_buf[0]);
+   u64 v1 = be32_to_cpu(prop_buf[1]);
+   *out_addr = (v0 << 32) | v1;
+   } else if (reglen == 1) {
+   *out_addr = be32_to_cpu(prop_buf[0]);
+   } else {
+   return 0;
+   }
+
+   return 1;
+}
+
 int dt_xlate_addr(void *node, u32 *buf, int buflen, unsigned long *xlated_addr)
 {
 
diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c
index a9d2091..518af24 100644
--- a/arch/powerpc/boot/main.c
+++ b/arch/powerpc/boot/main.c
@@ -112,6 +112,13 @@ static struct addr_range prep_initrd(struct addr_range 
vmlinux, void *chosen,
} else if (initrd_size > 0) {
printf("Using loader supplied ramdisk at 0x%lx-0x%lx\n\r",
   initrd_addr, initrd_addr + initrd_size);
+   } else if (chosen) {
+   unsigned long initrd_end;
+
+   dt_read_addr(chosen, "linux,initrd-start", &initrd_addr);
+   dt_read_addr(chosen, "linux,initrd-end", &initrd_end);
+
+   initrd_size = initrd_end - initrd_addr;
}
 
/* If there's no initrd at all, we're done */
diff --git a/arch/powerpc/boot/of.h b/arch/powerpc/boot/of.h
index 31b2f5d..dc24770 100644
--- a/arch/powerpc/boot/of.h
+++ b/arch/powerpc/boot/of.h
@@ -26,22 +26,6 @@ typedef u16  __be16;
 typedef u32__be32;
 typedef u64__be64;
 
-#ifdef __LITTLE_ENDIAN__
-#define cpu_to_be16(x) swab16(x)
-#define be16_to_cpu(x) swab16(x)
-#define cpu_to_be32(x) swab32(x)
-#define be32_to_cpu(x) swab32(x)
-#define cpu_to_be64(x) swab64(x)
-#define be64_to_cpu(x) swab64(x)
-#else
-#define cpu_to_be16(x) (x)
-#define be16_to_cpu(x) (x)
-#define cpu_to_be32(x) (x)
-#define be32_to_cpu(x) (x)
-#define cpu_to_be64(x) (x)
-#define be64_to_cpu(x) (x)
-#endif
-
 #define PROM_ERROR (-1u)
 
 #endif /* _PPC_BOOT_OF_H_ */
diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h
index e060676..5100dd7 100644
--- a/arch/powerpc/boot/ops.h
+++ b/arch/powerpc/boot/ops.h
@@ -95,6 +95,7 @@ void *simple_alloc_init(char *base, unsigned long heap_size,
 extern void flush_cache(void *, unsigned long);
 int dt_xlate_reg(void *node, int res, unsigned long *addr, unsigned long 
*size);
 int dt_xlate_addr(void *node, u32 *buf, int buflen, unsigned long 
*xlated_addr);
+int dt_read_addr(void *node, const char *prop, unsigned long *out);
 int dt_is_compatible(void *node, const char *compat);
 void dt_get_reg_format(void *node, u32 *naddr, u32 *nsize);
 int dt_get_virtual_reg(void *node, void **addr, int nres);
diff --git a/arch/powerpc/boot/swab.h b/arch/powerpc/boot/swab.h
index 11d2069..82db2c1 100644
--- a/arch/powerpc/boot/swab.h
+++ b/arch/powerpc/boot/swab.h
@@ -27,4 +27,21 @@ static inline u64 swab64(u64 x)
(u64)((x & (u64)0x00ffULL) >> 40) |
(u64)((x & (u64)0xff00ULL) >> 56);
 }
+
+#ifdef __LITTLE_ENDIAN__
+#define cpu_to_be16(x) swab16(x)
+#define be16_to_cpu(x) swab16(

Re: [PATCH] powerpc/boot: Fix the initrd being overwritten under qemu

2019-10-23 Thread Oliver O'Halloran
On Wed, Oct 23, 2019 at 10:21 PM Segher Boessenkool
 wrote:
>
> On Wed, Oct 23, 2019 at 12:36:35PM +1100, Oliver O'Halloran wrote:
> > When booting under OF the zImage expects the initrd address and size to be
> > passed to it using registers r3 and r4. SLOF (guest firmware used by QEMU)
> > currently doesn't do this so the zImage is not aware of the initrd
> > location.  This can result in initrd corruption either though the zImage
> > extracting the vmlinux over the initrd, or by the vmlinux overwriting the
> > initrd when relocating itself.
> >
> > QEMU does put the linux,initrd-start and linux,initrd-end properties into
> > the devicetree to vmlinux to find the initrd. We can work around the SLOF
> > bug by also looking those properties in the zImage.
>
> This is not a bug.  What boot protocol requires passing the initrd start
> and size in GPR3, GPR4?
>
> The CHRP binding (what SLOF implements) requires passing two zeroes here.
> And ePAPR requires passing the address of a device tree and a zero, plus
> something in GPR6 to allow distinguishing what it does.

This is what is assumed by the zImage.pseries. I have no idea where
that assumption comes from,A B

> As Alexey says, initramfs works just fine, so please use that?  initrd was
> deprecated when this code was written already.

That's not what Alexey said and the distinction between an initrd and
an initramfs is completely arbitrary.

>
>
> Segher


Re: [PATCH 09/10] powerpc: Enable OpenCAPI Storage Class Memory driver on bare metal

2019-10-27 Thread Oliver O'Halloran
On Fri, Oct 25, 2019 at 3:51 PM Alastair D'Silva  wrote:
>
> From: Alastair D'Silva 
>
> Enable OpenCAPI Storage Class Memory driver on bare metal
>
> Signed-off-by: Alastair D'Silva 
> ---
>  arch/powerpc/configs/powernv_defconfig | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/arch/powerpc/configs/powernv_defconfig 
> b/arch/powerpc/configs/powernv_defconfig
> index 6658cceb928c..45c0eff94964 100644
> --- a/arch/powerpc/configs/powernv_defconfig
> +++ b/arch/powerpc/configs/powernv_defconfig
> @@ -352,3 +352,7 @@ CONFIG_KVM_BOOK3S_64=m
>  CONFIG_KVM_BOOK3S_64_HV=m
>  CONFIG_VHOST_NET=m
>  CONFIG_PRINTK_TIME=y
> +CONFIG_OCXL_SCM=m

> +CONFIG_DEV_DAX=y
> +CONFIG_DEV_DAX_PMEM=y

These should probably be modules. Having them as builtins will force
their dependencies (i.e. libnvdimm) to be built into the kernel too.

> +CONFIG_FS_DAX=y
> --
> 2.21.0
> ___
> Linux-nvdimm mailing list -- linux-nvd...@lists.01.org
> To unsubscribe send an email to linux-nvdimm-le...@lists.01.org


[PATCH v2 1/3] powernv/iov: Ensure the pdn for VFs always contains a valid PE number

2019-10-28 Thread Oliver O'Halloran
On pseries there is a bug with adding hotplugged devices to an IOMMU group.
For a number of dumb reasons fixing that bug first requires re-working how
VFs are configured on PowerNV. For background, on PowerNV we use the
pcibios_sriov_enable() hook to do two things:

1. Create a pci_dn structure for each of the VFs, and
2. Configure the PHB's internal BARs so the MMIO range for each VF
   maps to a unique PE.

Roughly speaking a PE is the hardware counterpart to a Linux IOMMU group
since all the devices in a PE share the same IOMMU table. A PE also defines
the set of devices that should be isolated in response to a PCI error (i.e.
bad DMA, UR/CA, AER events, etc). When isolated all MMIO and DMA traffic to
and from devicein the PE is blocked by the root complex until the PE is
recovered by the OS.

The requirement to block MMIO causes a giant headache because the P8 PHB
generally uses a fixed mapping between MMIO addresses and PEs.  As a result
we need to delay configuring the IOMMU groups for device until after MMIO
resources are assigned. For physical devices (i.e. non-VFs) the PE
assignment is done in pcibios_setup_bridge() which is called immediately
after the MMIO resources for downstream devices (and the bridge's windows)
are assigned. For VFs the setup is more complicated because:

a) pcibios_setup_bridge() is not called again when VFs are activated, and
b) The pci_dev for VFs are created by generic code which runs after
   pcibios_sriov_enable() is called.

The work around for this is a two step process:

1. A fixup in pcibios_add_device() is used to initialised the cached
   pe_number in pci_dn, then
2. A bus notifier then adds the device to the IOMMU group for the PE
   specified in pci_dn->pe_number.

A side effect fixing the pseries bug mentioned in the first paragraph is
moving the fixup out of pcibios_add_device() and into
pcibios_bus_add_device(), which is called much later. This results in step
2. failing because pci_dn->pe_number won't be initialised when the bus
notifier is run.

We can fix this by removing the need for the fixup. The PE for a VF is
known before the VF is even scanned so we can initialise pci_dn->pe_number
pcibios_sriov_enable() instead. Unfortunately, moving the initialisation
causes two problems:

1. We trip the WARN_ON() in the current fixup code, and
2. The EEH core clears pdn->pe_number when recovering a VF and relies
   on the fixup to correctly re-set it.

The only justification for either of these is a comment in eeh_rmv_device()
suggesting that pdn->pe_number *must* be set to IODA_INVALID_PE in order
for the VF to be scanned. However, this comment appears to have no basis
in reality. Both bugs can be fixed by just deleting the code.

Tested-by: Alexey Kardashevskiy 
Reviewed-by: Alexey Kardashevskiy 
Signed-off-by: Oliver O'Halloran 
---
v2: Re-wrote commit message, got very depressed about the state of things.

The real fix here is to move the IOMMU group setup for both VFs and
PFs into pcibios_bus_add_device() and kill the pcibios_setup_bridge()
hack and the bus notifier hack, but doing that requires some pretty
gnarly changes. The fix here is much less invasive.
---
 arch/powerpc/kernel/eeh_driver.c  |  6 --
 arch/powerpc/platforms/powernv/pci-ioda.c | 19 +++
 arch/powerpc/platforms/powernv/pci.c  |  4 
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index d9279d0..7955fba 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -541,12 +541,6 @@ static void eeh_rmv_device(struct eeh_dev *edev, void 
*userdata)
 
pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
edev->pdev = NULL;
-
-   /*
-* We have to set the VF PE number to invalid one, which is
-* required to plug the VF successfully.
-*/
-   pdn->pe_number = IODA_INVALID_PE;
 #endif
if (rmv_data)
list_add(&edev->rmv_entry, &rmv_data->removed_vf_list);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5e3172d..70508b3 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1558,6 +1558,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, 
u16 num_vfs)
 
/* Reserve PE for each VF */
for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+   int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
+   int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
+   struct pci_dn *vf_pdn;
+
if (pdn->m64_single_mode)
pe_num = pdn->pe_num_map[vf_index];
else
@@ -1570,13 +1574,11 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *p

[PATCH v2 2/3] powerpc/pci: Fix pcibios_setup_device() ordering

2019-10-28 Thread Oliver O'Halloran
From: Shawn Anastasio 

Move PCI device setup from pcibios_add_device() and pcibios_fixup_bus() to
pcibios_bus_add_device(). This ensures that platform-specific DMA and IOMMU
setup occurs after the device has been registered in sysfs, which is a
requirement for IOMMU group assignment to work

This fixes IOMMU group assignment for hotplugged devices on pseries, where
the existing behavior results in IOMMU assignment before registration.

Thanks to Lukas Wunner  for the suggestion.

Signed-off-by: Shawn Anastasio 
Tested-by: Alexey Kardashevskiy 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/kernel/pci-common.c | 25 +
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 1c448cf..b89925ed 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -261,12 +261,6 @@ int pcibios_sriov_disable(struct pci_dev *pdev)
 
 #endif /* CONFIG_PCI_IOV */
 
-void pcibios_bus_add_device(struct pci_dev *pdev)
-{
-   if (ppc_md.pcibios_bus_add_device)
-   ppc_md.pcibios_bus_add_device(pdev);
-}
-
 static resource_size_t pcibios_io_size(const struct pci_controller *hose)
 {
 #ifdef CONFIG_PPC64
@@ -987,15 +981,17 @@ static void pcibios_setup_device(struct pci_dev *dev)
ppc_md.pci_irq_fixup(dev);
 }
 
-int pcibios_add_device(struct pci_dev *dev)
+void pcibios_bus_add_device(struct pci_dev *pdev)
 {
-   /*
-* We can only call pcibios_setup_device() after bus setup is complete,
-* since some of the platform specific DMA setup code depends on it.
-*/
-   if (dev->bus->is_added)
-   pcibios_setup_device(dev);
+   /* Perform platform-specific device setup */
+   pcibios_setup_device(pdev);
+
+   if (ppc_md.pcibios_bus_add_device)
+   ppc_md.pcibios_bus_add_device(pdev);
+}
 
+int pcibios_add_device(struct pci_dev *dev)
+{
 #ifdef CONFIG_PCI_IOV
if (ppc_md.pcibios_fixup_sriov)
ppc_md.pcibios_fixup_sriov(dev);
@@ -1037,9 +1033,6 @@ void pcibios_fixup_bus(struct pci_bus *bus)
 
/* Now fixup the bus bus */
pcibios_setup_bus_self(bus);
-
-   /* Now fixup devices on that bus */
-   pcibios_setup_bus_devices(bus);
 }
 EXPORT_SYMBOL(pcibios_fixup_bus);
 
-- 
2.9.5



[PATCH v2 3/3] powerpc/pci: Remove pcibios_setup_bus_devices()

2019-10-28 Thread Oliver O'Halloran
With the previous patch applied pcibios_setup_device() will always be run
when pcibios_bus_add_device() is called. There are several code paths where
pcibios_setup_bus_device() is still called (the PowerPC specific PCI
hotplug support is one) so with just the previous patch applied the setup
can be run multiple times on a device, once before the device is added
to the bus and once after.

There's no need to run the setup in the early case any more so just
remove it entirely.

Signed-off-by: Oliver O'Halloran 
Tested-by: Alexey Kardashevskiy 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/pci.h|  1 -
 arch/powerpc/kernel/pci-common.c  | 25 -
 arch/powerpc/kernel/pci-hotplug.c |  1 -
 arch/powerpc/kernel/pci_of_scan.c |  1 -
 4 files changed, 28 deletions(-)

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 327567b..63ed7e3 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -113,7 +113,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file 
*file,
 pgprot_t prot);
 
 extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose);
-extern void pcibios_setup_bus_devices(struct pci_bus *bus);
 extern void pcibios_setup_bus_self(struct pci_bus *bus);
 extern void pcibios_setup_phb_io_space(struct pci_controller *hose);
 extern void pcibios_scan_phb(struct pci_controller *hose);
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index b89925ed..f8a59d7 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1000,24 +1000,6 @@ int pcibios_add_device(struct pci_dev *dev)
return 0;
 }
 
-void pcibios_setup_bus_devices(struct pci_bus *bus)
-{
-   struct pci_dev *dev;
-
-   pr_debug("PCI: Fixup bus devices %d (%s)\n",
-bus->number, bus->self ? pci_name(bus->self) : "PHB");
-
-   list_for_each_entry(dev, &bus->devices, bus_list) {
-   /* Cardbus can call us to add new devices to a bus, so ignore
-* those who are already fully discovered
-*/
-   if (pci_dev_is_added(dev))
-   continue;
-
-   pcibios_setup_device(dev);
-   }
-}
-
 void pcibios_set_master(struct pci_dev *dev)
 {
/* No special bus mastering setup handling */
@@ -1036,13 +1018,6 @@ void pcibios_fixup_bus(struct pci_bus *bus)
 }
 EXPORT_SYMBOL(pcibios_fixup_bus);
 
-void pci_fixup_cardbus(struct pci_bus *bus)
-{
-   /* Now fixup devices on that bus */
-   pcibios_setup_bus_devices(bus);
-}
-
-
 static int skip_isa_ioresource_align(struct pci_dev *dev)
 {
if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) &&
diff --git a/arch/powerpc/kernel/pci-hotplug.c 
b/arch/powerpc/kernel/pci-hotplug.c
index fc62c4b..d6a67f8 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -134,7 +134,6 @@ void pci_hp_add_devices(struct pci_bus *bus)
 */
slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
-   pcibios_setup_bus_devices(bus);
max = bus->busn_res.start;
/*
 * Scan bridges that are already configured. We don't touch
diff --git a/arch/powerpc/kernel/pci_of_scan.c 
b/arch/powerpc/kernel/pci_of_scan.c
index f91d7e9..c3024f1 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -414,7 +414,6 @@ static void __of_scan_bus(struct device_node *node, struct 
pci_bus *bus,
 */
if (!rescan_existing)
pcibios_setup_bus_self(bus);
-   pcibios_setup_bus_devices(bus);
 
/* Now scan child busses */
for_each_pci_bridge(dev, bus)
-- 
2.9.5



[PATCH v2 2/2] powerpc/powernv: Use common code for the symbol_map export

2019-10-31 Thread Oliver O'Halloran
Long before we had a generic way for firmware to export memory ranges of
interest we added a special case for the skiboot symbol map. The code is
pretty much identical to the generic export so re-use the code.

Signed-off-by: Oliver O'Halloran 
---
v2: Actually compile.
---
 arch/powerpc/platforms/powernv/opal.c | 48 ---
 1 file changed, 10 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal.c 
b/arch/powerpc/platforms/powernv/opal.c
index 0373da5..e4ea27d 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -708,42 +708,6 @@ static int opal_sysfs_init(void)
return 0;
 }
 
-static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
-  struct bin_attribute *bin_attr,
-  char *buf, loff_t off, size_t count)
-{
-   return memory_read_from_buffer(buf, count, &off, bin_attr->private,
-  bin_attr->size);
-}
-
-static struct bin_attribute symbol_map_attr = {
-   .attr = {.name = "symbol_map", .mode = 0400},
-   .read = symbol_map_read
-};
-
-static void opal_export_symmap(void)
-{
-   const __be64 *syms;
-   unsigned int size;
-   struct device_node *fw;
-   int rc;
-
-   fw = of_find_node_by_path("/ibm,opal/firmware");
-   if (!fw)
-   return;
-   syms = of_get_property(fw, "symbol-map", &size);
-   if (!syms || size != 2 * sizeof(__be64))
-   return;
-
-   /* Setup attributes */
-   symbol_map_attr.private = __va(be64_to_cpu(syms[0]));
-   symbol_map_attr.size = be64_to_cpu(syms[1]);
-
-   rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr);
-   if (rc)
-   pr_warn("Error %d creating OPAL symbols file\n", rc);
-}
-
 static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
struct bin_attribute *bin_attr, char *buf,
loff_t off, size_t count)
@@ -832,6 +796,7 @@ static void opal_export_attrs(void)
 {
struct device_node *np;
struct kobject *kobj;
+   int rc;
 
np = of_find_node_by_path("/ibm,opal/firmware/exports");
if (!np)
@@ -846,6 +811,15 @@ static void opal_export_attrs(void)
 
opal_add_exported_attrs(np, kobj);
 
+   /*
+* NB: symbol_map existed before the generic export interface so it
+* lives under the top level opal_kobj.
+*/
+   rc = opal_add_one_export(opal_kobj, "symbol_map",
+np->parent, "symbol-map");
+   if (rc)
+   pr_warn("Error %d creating OPAL symbols file\n", rc);
+
of_node_put(np);
 }
 
@@ -991,8 +965,6 @@ static int __init opal_init(void)
/* Create "opal" kobject under /sys/firmware */
rc = opal_sysfs_init();
if (rc == 0) {
-   /* Export symbol map to userspace */
-   opal_export_symmap();
/* Setup dump region interface */
opal_dump_region_init();
/* Setup error log interface */
-- 
2.9.5



[PATCH v2 1/2] powerpc/powernv: Rework exports to support subnodes

2019-10-31 Thread Oliver O'Halloran
Originally we only had a handful of exported memory ranges, but we'd to
export the per-core trace buffers. This results in a lot of files in the
exports directory which is a but unfortunate. We can clean things up a bit
by turning subnodes into subdirectories of the exports directory.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/opal.c | 114 +-
 1 file changed, 72 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal.c 
b/arch/powerpc/platforms/powernv/opal.c
index 38e9027..0373da5 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -752,6 +752,75 @@ static ssize_t export_attr_read(struct file *fp, struct 
kobject *kobj,
   bin_attr->size);
 }
 
+static int opal_add_one_export(struct kobject *parent, const char *export_name,
+  struct device_node *np, const char *prop_name)
+{
+   struct bin_attribute *attr = NULL;
+   const char *name = NULL;
+   u64 vals[2];
+   int rc;
+
+   rc = of_property_read_u64_array(np, prop_name, &vals[0], 2);
+   if (rc)
+   goto out;
+
+   attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+   name = kstrdup(export_name, GFP_KERNEL);
+   if (!name) {
+   rc = -ENOMEM;
+   goto out;
+   }
+
+   sysfs_bin_attr_init(attr);
+   attr->attr.name = name;
+   attr->attr.mode = 0400;
+   attr->read = export_attr_read;
+   attr->private = __va(vals[0]);
+   attr->size = vals[1];
+
+   rc = sysfs_create_bin_file(parent, attr);
+out:
+   if (rc) {
+   kfree(name);
+   kfree(attr);
+   }
+
+   return rc;
+}
+
+static void opal_add_exported_attrs(struct device_node *np,
+   struct kobject *kobj)
+{
+   struct device_node *child;
+   struct property *prop;
+
+   for_each_property_of_node(np, prop) {
+   int rc;
+
+   if (!strcmp(prop->name, "name") ||
+   !strcmp(prop->name, "phandle"))
+   continue;
+
+   rc = opal_add_one_export(kobj, prop->name, np, prop->name);
+   if (rc) {
+   pr_warn("Unable to add export %pOF/%s, rc = %d!\n",
+   np, prop->name, rc);
+   }
+   }
+
+   for_each_child_of_node(np, child) {
+   struct kobject *child_kobj;
+
+   child_kobj = kobject_create_and_add(child->name, kobj);
+   if (!child_kobj) {
+   pr_err("Unable to create export dir for %pOF\n", child);
+   continue;
+   }
+
+   opal_add_exported_attrs(child, child_kobj);
+   }
+}
+
 /*
  * opal_export_attrs: creates a sysfs node for each property listed in
  * the device-tree under /ibm,opal/firmware/exports/
@@ -761,12 +830,8 @@ static ssize_t export_attr_read(struct file *fp, struct 
kobject *kobj,
  */
 static void opal_export_attrs(void)
 {
-   struct bin_attribute *attr;
struct device_node *np;
-   struct property *prop;
struct kobject *kobj;
-   u64 vals[2];
-   int rc;
 
np = of_find_node_by_path("/ibm,opal/firmware/exports");
if (!np)
@@ -779,41 +844,7 @@ static void opal_export_attrs(void)
return;
}
 
-   for_each_property_of_node(np, prop) {
-   if (!strcmp(prop->name, "name") || !strcmp(prop->name, 
"phandle"))
-   continue;
-
-   if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
-   continue;
-
-   attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-
-   if (attr == NULL) {
-   pr_warn("Failed kmalloc for bin_attribute!");
-   continue;
-   }
-
-   sysfs_bin_attr_init(attr);
-   attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
-   attr->attr.mode = 0400;
-   attr->read = export_attr_read;
-   attr->private = __va(vals[0]);
-   attr->size = vals[1];
-
-   if (attr->attr.name == NULL) {
-   pr_warn("Failed kstrdup for bin_attribute attr.name");
-   kfree(attr);
-   continue;
-   }
-
-   rc = sysfs_create_bin_file(kobj, attr);
-   if (rc) {
-   pr_warn("Error %d creating OPAL sysfs exports/%s 
file\n",
-rc, prop->name);
-   kfree(attr->attr.name);
-   kfree(attr);
-   }
-   }
+   opal_add_expo

[PATCH 1/2] powerpc/xmon: Allow passing an argument to ppc_md.restart()

2019-11-01 Thread Oliver O'Halloran
On PowerNV a few different kinds of reboot are supported. We'd like to be
able to exercise these from xmon so allow 'zr' to take an argument, and
pass that to the ppc_md.restart() function.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/xmon/xmon.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index d83364e..6a6f675 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1147,16 +1147,19 @@ static int do_step(struct pt_regs *regs)
 
 static void bootcmds(void)
 {
+   char tmp[64];
int cmd;
 
cmd = inchar();
-   if (cmd == 'r')
-   ppc_md.restart(NULL);
-   else if (cmd == 'h')
+   if (cmd == 'r') {
+   getstring(tmp, 64);
+   ppc_md.restart(tmp);
+   } else if (cmd == 'h') {
ppc_md.halt();
-   else if (cmd == 'p')
+   } else if (cmd == 'p') {
if (pm_power_off)
pm_power_off();
+   }
 }
 
 static int cpu_cmd(void)
-- 
2.9.5



[PATCH 2/2] powerpc/powernv: Allow manually invoking special reboots

2019-11-01 Thread Oliver O'Halloran
OPAL provides several different kinds of reboot for the kernel to use,
namely forcing a full reboot, platform error reboot and MPIPL. Right now
triggering the alternative resets requires some ad-hoc method such as
triggering a kernel crash and hoping the stars align. It's sometimes handy
to be able to trigger one of these resets directly, so add a way to do
that.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/setup.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/setup.c 
b/arch/powerpc/platforms/powernv/setup.c
index 8349860..11fdae8 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -233,6 +233,10 @@ static void  __noreturn pnv_restart(char *cmd)
rc = opal_cec_reboot();
else if (strcmp(cmd, "full") == 0)
rc = opal_cec_reboot2(OPAL_REBOOT_FULL_IPL, NULL);
+   else if (strcmp(cmd, "mpipl") == 0)
+   rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, NULL);
+   else if (strcmp(cmd, "error") == 0)
+   rc = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, NULL);
else
rc = OPAL_UNSUPPORTED;
 
-- 
2.9.5



Re: [PATCH] powerpc/papr_scm: Delete unnecessary assignment for the field “owner”

2019-11-03 Thread Oliver O'Halloran
On Sun, Nov 3, 2019 at 11:31 PM Markus Elfring  wrote:
>
> From: Markus Elfring 
> Date: Sun, 3 Nov 2019 13:23:13 +0100
>
> The field “owner” is set by the core.
> Thus delete an unneeded initialisation.

Acked-by: Oliver O'Halloran 

>
> Generated by: scripts/coccinelle/api/platform_no_drv_owner.cocci
>
> Signed-off-by: Markus Elfring 
> ---
>  arch/powerpc/platforms/pseries/papr_scm.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index ee07d0718bf1..f87b474d25a7 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -513,7 +513,6 @@ static struct platform_driver papr_scm_driver = {
> .remove = papr_scm_remove,
> .driver = {
> .name = "papr_scm",
> -   .owner = THIS_MODULE,
> .of_match_table = papr_scm_match,
> },
>  };
> --
> 2.23.0
>


Re: PROBLEM: PCIe Bus Error atleast

2019-11-03 Thread Oliver O'Halloran
On Sat, Nov 2, 2019 at 5:46 AM Jeffrin Thalakkottoor
 wrote:
>
> hello ,
>
> i found a error message  as the output of "sudo dmesg -l err"
> i have attached related to that in this email.
> i  think i found this in 5.3.8 kernel

Use "uname -a" to get the current kernel version, architecture.

> But i think when i tried again today i could not reproduce it

That's unfortunate, but it might have just been a transient problem.
The log has a pile of these AER errors:

[  283.723848] pcieport :00:1c.5: AER: PCIe Bus Error:
severity=Corrected, type=Data Link Layer, (Transmitter ID)
[  283.723855] pcieport :00:1c.5: AER:   device [8086:9d15] error
status/mask=1000/2000
[  283.723859] pcieport :00:1c.5: AER:[12] Timeout

Which looks like a root port is getting a timeouts while trying to
talk to its downstream device. It's hard to say anything more without
knowing what the downstream device is, or what the system is. If this
is a laptop it might be due to buggy power management, but it might
just be flakey hardware.

Can you provide the full dmesg and the output of lspci -vv?

Oliver


Re: [PATCH RFC 02/15] powerpc/eeh: Rename eeh_pe_get() to eeh_pe_find()

2019-11-12 Thread Oliver O'Halloran
On Wed, Oct 2, 2019 at 4:03 PM Sam Bobroff  wrote:
>
> There are now functions eeh_get_pe() and eeh_pe_get() which seems
> likely to cause confusion.
>
> Keep eeh_get_pe() because "get" is commonly used to refer to acquiring
> a reference (which it does), and rename eeh_pe_get() to eeh_pe_find()
> because it performs a search.
>
> Signed-off-by: Sam Bobroff 

Good idea.

Reviewed-by: Oliver O'Halloran 


[PATCH] powerpc/powernv: Disable native PCIe port management

2019-11-13 Thread Oliver O'Halloran
On PowerNV the PCIe topology is (currently) managed the powernv platform
code in cooperation with firmware. The PCIe-native service drivers bypass
both and this can cause problems.

Historically this hasn't been a big deal since the only port service
driver that saw much use was the AER driver. The AER driver relies
a kernel service to report when errors occur rather than acting autonmously
so it's fairly easy to ignore. On PowerNV (and pseries) AER events are
handled through EEH, which ignores the AER service, so it's never been
an issue.

Unfortunately, the hotplug port service driver (pciehp) does act
autonomously and conflicts with the platform specific hotplug
driver (pnv_php). The main issue is that pciehp claims the interrupt
associated with the PCIe capability which in turn prevents pnv_php from
claiming it.

This results in hotplug events being handled by pciehp which does not
notify firmware when the PCIe topology changes, and does not setup/teardown
the arch specific PCI device structures (pci_dn) when the topology changes.
The end result is that hot-added devices cannot be enabled and hot-removed
devices may not be fully torn-down on removal.

We can fix these problems by setting the "pcie_ports_disabled" flag during
platform initialisation. The flag indicates the platform owns the PCIe
ports which stops the portbus driver being registered.

Cc: Sergey Miroshnichenko 
Fixes: 66725152fb9f ("PCI/hotplug: PowerPC PowerNV PCI hotplug driver")
Signed-off-by: Oliver O'Halloran 
---
Sergey, just FYI. I'll try sort out the rest of the hotplug
trainwreck in 5.6.

The Fixes: here is for the patch that added pnv_php in 4.8. It's been
a problem since then, but wasn't noticed until people started testing
it after the EEH fixes in commit 799abe283e51 ("powerpc/eeh: Clean up
EEH PEs after recovery finishes") went in earlier in the 5.4 cycle.
---
 arch/powerpc/platforms/powernv/pci.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 2825d00..ae62583 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -941,6 +941,23 @@ void __init pnv_pci_init(void)
 
pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
 
+#ifdef CONFIG_PCIEPORTBUS
+   /*
+* On PowerNV PCIe devices are (currently) managed in cooperation
+* with firmware. This isn't *strictly* required, but there's enough
+* assumptions baked into both firmware and the platform code that
+* it's unwise to allow the portbus services to be used.
+*
+* We need to fix this eventually, but for now set this flag to disable
+* the portbus driver. The AER service isn't required since that AER
+* events are handled via EEH. The pciehp hotplug driver can't work
+* without kernel changes (and portbus binding breaks pnv_php). The
+* other services also require some thinking about how we're going
+* to integrate them.
+*/
+   pcie_ports_disabled = true;
+#endif
+
/* If we don't have OPAL, eg. in sim, just skip PCI probe */
if (!firmware_has_feature(FW_FEATURE_OPAL))
return;
-- 
2.9.5



Re: [PATCH] powerpc/perf: Disable trace_imc pmu

2019-11-13 Thread Oliver O'Halloran
On Thu, Nov 14, 2019 at 6:19 PM Madhavan Srinivasan
 wrote:
>
> When a root user or a user with CAP_SYS_ADMIN
> privilege use trace_imc performance monitoring
> unit events, to monitor application or KVM threads,
> may result in a checkstop (System crash). Reason
> being frequent switch of the "trace/accumulation"
> mode of In-Memory Collection hardware.
> This patch disables trace_imc pmu unit, but will
> be re-enabled at a later stage with a fix patchset.
> ---
>  arch/powerpc/platforms/powernv/opal-imc.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
> b/arch/powerpc/platforms/powernv/opal-imc.c
> index e04b20625cb9..5790f078771f 100644
> --- a/arch/powerpc/platforms/powernv/opal-imc.c
> +++ b/arch/powerpc/platforms/powernv/opal-imc.c
> @@ -285,7 +285,12 @@ static int opal_imc_counters_probe(struct 
> platform_device *pdev)
> domain = IMC_DOMAIN_THREAD;
> break;
> case IMC_TYPE_TRACE:
> -   domain = IMC_DOMAIN_TRACE;
> +   /* Using trace_imc events to monitor
> +* application or KVM thread performances
> +* may result in a checkstop (system crash).
> +* So disabling it for now.
> +*/
> +   domain = -1;
> break;
> default:
> pr_warn("IMC Unknown Device type \n");
> --
> 2.21.0
>

Does this need a Fixes: tag?


Re: [PATCH] powerpc/powernv: Disable native PCIe port management

2019-11-14 Thread Oliver O'Halloran
On Thu, Nov 14, 2019 at 1:31 AM Bjorn Helgaas  wrote:
>
> This is fine, but it feels like sort of a blunt instrument.  Is there
> any practical way to clear pci_host_bridge.native_pcie_hotplug (and
> native_aer if appropriate) for the PHBs in question? That would also
> prevent pciehp from binding.

It is a large hammer, but I don't see a better way to handle it for
the moment. I had another look and my initial assessment was wrong in
that it's the portbus driver which claims the MSI rather than pciehp
itself. The MSI in the PCIe capability is shared between hotplug
events, PMEs, and BW notifications so to make the portbus concept work
the portbus driver needs to own the interrupt. Basicly, pnv_php and
portbus are fundamentally at odds with each other and can't be used
concurrently.

I also think there's some latent issues with the interaction of DPC
and EEH since they operate off the same set of error messages. We
haven't run into any problems yet, but I think that's largely because
we haven't shipped any systems with DPC enabled. In any case, I'd
prefer we disabled portbus until we've fully unpacked what's going on
there.

> We might someday pull portdrv into the PCI core directly instead of as
> a separate driver, and I'm thinking that might be easier if we have
> more specific indications of what the core shouldn't use.

It's not intended to be a permanent change. In the long term I want to
move everything except the initialisation and reset of the PHB out of
firmware and into the kernel so we can use more of the native PCIe
management features.


Re: [PATCH] powerpc/powernv: Disable native PCIe port management

2019-11-14 Thread Oliver O'Halloran
On Thu, Nov 14, 2019 at 7:39 AM Tyrel Datwyler  wrote:
>
> Nothing but pedantic spelling and grammar nits of the commit log follow.
>
> -Tyrel

Thanks. My speeling is bad even on a good day and it was not a good day.


Re: [PATCH 01/11] powerpc/powernv/ioda: Fix ref count for devices with their own PE

2019-11-17 Thread Oliver O'Halloran
On Mon, Nov 18, 2019 at 12:06 PM Alistair Popple  wrote:
>
> On Wednesday, 13 November 2019 4:38:21 AM AEDT Frederic Barrat wrote:
> >
> > However, one question is whether this patch breaks nvlink and if nvlink
> > assumes the devices won’t go away because we explicitly take a reference
> > forever. In npu_dma.c, there are 2 functions which allow to find the GPU
> > associated to a npu device, and vice-versa. Both functions return a
> > pointer to a struct pci_dev, but they don’t take a reference on the
> > device being returned. So that seems dangerous. I’m probably missing
> > something.
> >
> > Alexey, Alistair: what, if anything, guarantees, that the npu or gpu
> > devices stay valid. Is it because we simply don’t provide any means to
> > get rid of them ? Otherwise, don’t we need the callers of
> > pnv_pci_get_gpu_dev() and pnv_pci_get_npu_dev() to worry about reference
> > counting ? I’ve started looking into it and the changes are scary, which
> > explains Greg’s related commit 02c5f5394918b.
>
> To be honest the reference counting looks like it has evolved into something
> quite suspect and I don't think you're missing anything. In practice though we
> likely haven't hit any issues because the original callers didn't store
> references to the pdev which would make the window quite small (although the
> pass through work may have changed that). And as you say there simply wasn't
> any means to get rid of them anyway (EEH, hotplug, etc. has never been
> implemented or supported for GPUs and all sorts of terrible things happen if
> you try).

In other words: leaking a ref is the only safe thing to do.

> The best solution would likely be to review the reference counting and to
> teach callers of get_*_dev() to call pci_put_dev(), etc.

The issue is that the two callers of get_pci_dev() are non-GPL
exported symbols so we don't know what's calling them or what their
expectations are. We be doing whatever makes sense for OpenCAPI and if
that happens to cause a problem for someone else, then they can deal
with it.

Oliver


[PATCH v2] powerpc/powernv: Disable native PCIe port management

2019-11-17 Thread Oliver O'Halloran
On PowerNV the PCIe topology is (currently) managed by the powernv platform
code in Linux in cooperation with the platform firmware. Linux's native
PCIe port service drivers operate independently of both and this can cause
problems.

The main issue is that the portbus driver will conflict with the platform
specific hotplug driver (pnv_php) over ownership of the MSI used to notify
the host when a hotplug event occurs. The portbus driver claims this MSI on
behalf of the individual port services because the same interrupt is used
for hotplug events, PMEs (on root ports), and link bandwidth change
notifications. The portbus driver will always claim the interrupt even if
the individual port service drivers, such as pciehp, are compiled out.

The second, bigger, problem is that the hotplug port service driver
fundamentally does not work on PowerNV. The platform assumes that all
PCI devices have a corresponding arch-specific handle derived from the DT
node for the device (pci_dn) and without one the platform will not allow
a PCI device to be enabled. This problem is largely due to historical
baggage, but it can't be resolved without significant re-factoring of the
platform PCI support.

We can fix these problems in the interim by setting the
"pcie_ports_disabled" flag during platform initialisation. The flag
indicates the platform owns the PCIe ports which stops the portbus driver
from being registered.

This does have the side effect of disabling all port services drivers
that is: AER, PME, BW notifications, hotplug, and DPC. However, this is
not a huge disadvantage on PowerNV since these services are either unused
or handled through other means.

Cc: Sergey Miroshnichenko 
Fixes: 66725152fb9f ("PCI/hotplug: PowerPC PowerNV PCI hotplug driver")
Signed-off-by: Oliver O'Halloran 
---
Sergey, just FYI. I'll try sort out the rest of the hotplug
trainwreck in 5.6.

The Fixes: here is for the patch that added pnv_php in 4.8. It's been
a problem since then, but wasn't noticed until people started testing
it after the EEH fixes in commit 799abe283e51 ("powerpc/eeh: Clean up
EEH PEs after recovery finishes") went in earlier in the 5.4 cycle.
---
v2: Moved setting the flag until after we check for OPAL support. No
actual functional change since we've already probed the platform,
but it looks neater.

Re-wrote the commit message.
---
 arch/powerpc/platforms/powernv/pci.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 2825d00..c0bea75a 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -945,6 +945,23 @@ void __init pnv_pci_init(void)
if (!firmware_has_feature(FW_FEATURE_OPAL))
return;
 
+#ifdef CONFIG_PCIEPORTBUS
+   /*
+* On PowerNV PCIe devices are (currently) managed in cooperation
+* with firmware. This isn't *strictly* required, but there's enough
+* assumptions baked into both firmware and the platform code that
+* it's unwise to allow the portbus services to be used.
+*
+* We need to fix this eventually, but for now set this flag to disable
+* the portbus driver. The AER service isn't required since that AER
+* events are handled via EEH. The pciehp hotplug driver can't work
+* without kernel changes (and portbus binding breaks pnv_php). The
+* other services also require some thinking about how we're going
+* to integrate them.
+*/
+   pcie_ports_disabled = true;
+#endif
+
/* Look for IODA IO-Hubs. */
for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
pnv_pci_init_ioda_hub(np);
-- 
2.9.5



Re: [PATCH 02/11] powerpc/powernv/ioda: Protect PE list

2019-11-19 Thread Oliver O'Halloran
On Tue, Nov 19, 2019 at 11:57 PM Frederic Barrat  wrote:
>
> > Do the other accessors of ioda.pe_list also need mutex protection?
> > pnv_ioda_setup_bus_PE()
> > pnv_pci_dma_bus_setup()
> > pnv_pci_init_ioda_phb()
> > pnv_pci_ioda_setup_PEs()
>
>
> I think we could also use it there, it wouldn't hurt. Those functions
> are called when the kernel is building part of the PCI topology, and
> devices are not really active yet, so I don't think it's absolutely
> required.
>
> I'm actually not sure my patch is needed either. With hotplug, the
> devices can come and go, whereas the PHB remains. So it feels right to
> start protecting the list when adding/removing a device. But I don't
> think we can really have concurrency and have 2 different operations
> adding/removing devices at the same time under the same PHB, at least
> for opencapi. Maybe for PCI, if we have multiple slots under the same
> PHB. Not sure.

Creation of new pci_dev's is serialised by the global pci rescan and
remove lock so on the creation side it's not an issue. However, we can
release IODA PEs in the pci_dev's release function which might be
called without that lock held. It's pretty hard to hit that case
though since it require something to be holding a ref to the pci_dev
even after the driver's ->remove() function has been called.


PCIPOCALYPSE

2019-11-19 Thread Oliver O'Halloran
This series does a few things and probably needs to be split into two or
three smaller ones. I figured I'd post it as-is since I'm sick of sitting
on it and some people wanted people to take a look at it. There's three
parts:

1) Reworking EEH to move the "pseudo-generic" into the platform backend.
2) Moving the point where do do PE assignments for PCIe devices out of
   pcibios_setup_bridge() and into pcibios_bus_add_device().
3) Killing the use of pci_dn in powernv entirely.

It used to be a (much) longer series, but bits and pieces of been
upstreamed or at least posted to the list so I've omitted most of the
pre-reqs. Here is a tree you can build based on today's -next with
everything in it:

https://github.com/oohal/linux/tree/eeh-no-pdn-working

Keep in mind this is all pretty raw and I've tested it on precisely one
P8 PowerNV system. Things not tested:

 -> pseries (not recently anyway)
 -> CAPI
 -> OpenCAPI
 -> Any kind of NVLink

The main TODO is to finish what was started in 2) so that we handle PE
assignments, IOMMU configuration, etc in the same place for each PHB
type. Right now there's three distinct paths:

 1) For normal IODA PHBs (PHB3 and 4) the PE we can assign a device to is
 pinned by the location of it's MMIO BARs. How this is handled depends on
 whether the device is a VF or not, so the two sub cases are:

  a) For normal devices all the devices under a bridge are assigned to a
 PE in a walk done after configuring the bridge window. This causes a
 pile of wierd edge cases when a PCI device is removed without also
 removing it's parent bridge.

  b) For VFs PEs (and MMIO BARs) are assigned when we call sriov_enable() on
 the PF and we "fix up" the software state later on. As a result there's
 some IOMMU group stuff that happens in a bus notifier which runs after
 adding the device to a bus.

 2) For bullshit IODA PHBs (OpenCAPI / NVLink) there is no MMIO pinning
so we can assign a BDFN to an arbitrary PE. For devices under those
PEs are assigned in a per-PHB fixup that runs only once at boot,
just after the the PHB is probed. There doesn't seem to be good
reason for this and the lack of pinning means we should be able to
do it whenever.

This series fixes 1a) by moving the PE assignment into
pcibios_bus_add_device(), which is run per-device. With that change
fixing the other two cases should be relatively straight forward. VFs
will probably still require some special casing since their setup
works differently to normal PCI devices, but we should be able to do
better than the current trainwreck of random hacks occuring in random
places.




[Very RFC 01/46] powerpc/eeh: Don't attempt to restore VF config space after reset

2019-11-19 Thread Oliver O'Halloran
After resetting a VF we call eeh_restore_vf_config() to restore several
registers in the VFs config space. For physical functions this is normally
handled by the pci_reinit_device() OPAL call which requests firmware to
re-program the device with whatever defaults were set at boot time. We
can't use that for VFs since OPAL (being firmware) doesn't know (or care)
about VFs.

However, the fields that are restored here are all marked as reserved for
VFs in the SR-IOV spec. In other words, eeh_restore_vf_config() doesn't
actually do anything.

There is an argument to be made that we should be saving and restoring
some of these fields since they are marked as "Reserved, but Preserve"
(ResvP) to allow these fields to be used in new versions of the SR-IOV.
However, the current code doesn't even do that properly since it assumes
they can be set to whatever the EEH core has assumed to be correct. If
the fields *are* used in future versions of the SR-IOV spec this code
is still broken since it doesn't take into account any changes made
by the driver, or the Linux IOV core.

Given the above, just delete the code. It's broken, it's mis-leading,
and it's getting in the way of doing useful cleanups.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh.c| 59 
 arch/powerpc/platforms/powernv/eeh-powernv.c | 39 +++--
 arch/powerpc/platforms/pseries/eeh_pseries.c | 26 +
 3 files changed, 8 insertions(+), 116 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ae0a9c421d7b..a3b93db972fc 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -742,65 +742,6 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, 
void *userdata)
pci_restore_state(pdev);
 }
 
-int eeh_restore_vf_config(struct pci_dn *pdn)
-{
-   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
-   u32 devctl, cmd, cap2, aer_capctl;
-   int old_mps;
-
-   if (edev->pcie_cap) {
-   /* Restore MPS */
-   old_mps = (ffs(pdn->mps) - 8) << 5;
-   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
-2, &devctl);
-   devctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
-   devctl |= old_mps;
-   eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
- 2, devctl);
-
-   /* Disable Completion Timeout if possible */
-   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2,
-4, &cap2);
-   if (cap2 & PCI_EXP_DEVCAP2_COMP_TMOUT_DIS) {
-   eeh_ops->read_config(pdn,
-edev->pcie_cap + PCI_EXP_DEVCTL2,
-4, &cap2);
-   cap2 |= PCI_EXP_DEVCTL2_COMP_TMOUT_DIS;
-   eeh_ops->write_config(pdn,
- edev->pcie_cap + PCI_EXP_DEVCTL2,
- 4, cap2);
-   }
-   }
-
-   /* Enable SERR and parity checking */
-   eeh_ops->read_config(pdn, PCI_COMMAND, 2, &cmd);
-   cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
-   eeh_ops->write_config(pdn, PCI_COMMAND, 2, cmd);
-
-   /* Enable report various errors */
-   if (edev->pcie_cap) {
-   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
-2, &devctl);
-   devctl &= ~PCI_EXP_DEVCTL_CERE;
-   devctl |= (PCI_EXP_DEVCTL_NFERE |
-  PCI_EXP_DEVCTL_FERE |
-  PCI_EXP_DEVCTL_URRE);
-   eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
- 2, devctl);
-   }
-
-   /* Enable ECRC generation and check */
-   if (edev->pcie_cap && edev->aer_cap) {
-   eeh_ops->read_config(pdn, edev->aer_cap + PCI_ERR_CAP,
-4, &aer_capctl);
-   aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
-   eeh_ops->write_config(pdn, edev->aer_cap + PCI_ERR_CAP,
- 4, aer_capctl);
-   }
-
-   return 0;
-}
-
 /**
  * pcibios_set_pcie_reset_state - Set PCI-E reset state
  * @dev: pci device struct
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index ef727ecd99cd..b2ac4130fda7 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1649,20 +1649,13 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
if (!edev)
return -EEXIST

[Very RFC 02/46] powernv/pci: Add helper to find ioda_pe from BDFN

2019-11-19 Thread Oliver O'Halloran
Linux has a look-up table for mapping BDFNs to PEs which is updated when we
call into OPAL to update the PHB's internally BDFN<->PE mapping. We can use
this table to the PE for a device without needing to use the cached value
inside the pci_dn.

We'd like to get rid of pci_dn eventually so this patch adds adds a helper
to find the ioda_pe of a BDFN based on the table. This is different to the
existing helper which takes a pci_dev directly because there are some
contexts (e.g. EEH recovery) where we need to check for an existing PE
assignment when no corresponding pci_dev exists.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
 arch/powerpc/platforms/powernv/pci.h  |  1 +
 2 files changed, 11 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index fdacf98555e9..65b5b121ebad 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -660,6 +660,16 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int 
pe_no)
return state;
 }
 
+struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, u16 bdfn)
+{
+   int pe_number = phb->ioda.pe_rmap[bdfn];
+
+   if (pe_number == IODA_INVALID_PE)
+   return NULL;
+
+   return &phb->ioda.pe_array[pe_number];
+}
+
 struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 {
struct pci_controller *hose = pci_bus_to_host(dev->bus);
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index f914f0b14e4e..01a01739c03e 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -193,6 +193,7 @@ extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
 extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
 extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
 extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+extern struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, u16 bdfn);
 extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
 extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
-- 
2.21.0



[Very RFC 03/46] powernv/pci: Remove dma_dev_setup() for NPU PHBs

2019-11-19 Thread Oliver O'Halloran
The pnv_pci_dma_dev_setup() only does something when:

1) There PHB contains VFs, or
2) The PHB defines a dma_dev_setup() callback in the pnv_phb structure.

Neither is true for NPU PHBs, so don't set the callback in the 
pci_controller_ops.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 65b5b121ebad..099c0bb1a9b9 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3652,7 +3652,6 @@ static const struct pci_controller_ops 
pnv_pci_ioda_controller_ops = {
 };
 
 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
-   .dma_dev_setup  = pnv_pci_dma_dev_setup,
.setup_msi_irqs = pnv_setup_msi_irqs,
.teardown_msi_irqs  = pnv_teardown_msi_irqs,
.enable_device_hook = pnv_pci_enable_device_hook,
-- 
2.21.0



[Very RFC 04/46] powernv/pci: Move dma_{dev|bus}_setup into pci-ioda.c

2019-11-19 Thread Oliver O'Halloran
These functions are only used from pci-ioda.c. Move them in there and remove
the prototypes from the header files.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 43 +++
 arch/powerpc/platforms/powernv/pci.c  | 43 ---
 arch/powerpc/platforms/powernv/pci.h  |  2 --
 3 files changed, 43 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 099c0bb1a9b9..c2b3a5a13004 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3637,6 +3637,49 @@ static void pnv_pci_ioda_shutdown(struct pci_controller 
*hose)
   OPAL_ASSERT_RESET);
 }
 
+void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
+{
+   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+   struct pnv_phb *phb = hose->private_data;
+#ifdef CONFIG_PCI_IOV
+   struct pnv_ioda_pe *pe;
+
+   /* Fix the VF pdn PE number */
+   if (pdev->is_virtfn) {
+   list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+   if (pe->rid == ((pdev->bus->number << 8) |
+   (pdev->devfn & 0xff))) {
+   pe->pdev = pdev;
+   break;
+   }
+   }
+   }
+#endif /* CONFIG_PCI_IOV */
+
+   if (phb && phb->dma_dev_setup)
+   phb->dma_dev_setup(phb, pdev);
+}
+
+void pnv_pci_dma_bus_setup(struct pci_bus *bus)
+{
+   struct pci_controller *hose = bus->sysdata;
+   struct pnv_phb *phb = hose->private_data;
+   struct pnv_ioda_pe *pe;
+
+   list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+   if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+   continue;
+
+   if (!pe->pbus)
+   continue;
+
+   if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+   pe->pbus = bus;
+   break;
+   }
+   }
+}
+
 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
.dma_dev_setup  = pnv_pci_dma_dev_setup,
.dma_bus_setup  = pnv_pci_dma_bus_setup,
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index b7761e2e06f8..8b9058b52575 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -810,49 +810,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
return tbl;
 }
 
-void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
-{
-   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-   struct pnv_phb *phb = hose->private_data;
-#ifdef CONFIG_PCI_IOV
-   struct pnv_ioda_pe *pe;
-
-   /* Fix the VF pdn PE number */
-   if (pdev->is_virtfn) {
-   list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-   if (pe->rid == ((pdev->bus->number << 8) |
-   (pdev->devfn & 0xff))) {
-   pe->pdev = pdev;
-   break;
-   }
-   }
-   }
-#endif /* CONFIG_PCI_IOV */
-
-   if (phb && phb->dma_dev_setup)
-   phb->dma_dev_setup(phb, pdev);
-}
-
-void pnv_pci_dma_bus_setup(struct pci_bus *bus)
-{
-   struct pci_controller *hose = bus->sysdata;
-   struct pnv_phb *phb = hose->private_data;
-   struct pnv_ioda_pe *pe;
-
-   list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-   if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
-   continue;
-
-   if (!pe->pbus)
-   continue;
-
-   if (bus->number == ((pe->rid >> 8) & 0xFF)) {
-   pe->pbus = bus;
-   break;
-   }
-   }
-}
-
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
 {
struct pci_controller *hose = pci_bus_to_host(dev->bus);
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 01a01739c03e..f23145575048 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -189,8 +189,6 @@ extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, 
unsigned long msr);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
-extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
-extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
 extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
 extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 extern struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, u16 bdfn);
-- 
2.21.0



[Very RFC 05/46] powernv/pci: Remove the pnv_phb dma_dev_setup callback

2019-11-19 Thread Oliver O'Halloran
This is only ever set for IODA PHBs. The only call site is in
pnv_pci_dma_dev_setup(), which is also only used by normal IODA PHBs, so remove
the callback in favour of a direct call.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 +---
 arch/powerpc/platforms/powernv/pci.h  | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c2b3a5a13004..45f974258766 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3656,8 +3656,7 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
}
 #endif /* CONFIG_PCI_IOV */
 
-   if (phb && phb->dma_dev_setup)
-   phb->dma_dev_setup(phb, pdev);
+   pnv_pci_ioda_dma_dev_setup(phb, pdev);
 }
 
 void pnv_pci_dma_bus_setup(struct pci_bus *bus)
@@ -3940,7 +3939,6 @@ static void __init pnv_pci_init_ioda_phb(struct 
device_node *np,
hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
break;
default:
-   phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
hose->controller_ops = pnv_pci_ioda_controller_ops;
}
 
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index f23145575048..3c33a0c91a69 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -108,7 +108,6 @@ struct pnv_phb {
int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
 unsigned int hwirq, unsigned int virq,
 unsigned int is_64, struct msi_msg *msg);
-   void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
int (*init_m64)(struct pnv_phb *phb);
int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
-- 
2.21.0



[Very RFC 06/46] powerpc/iov: Move VF pdev fixup into pcibios_fixup_iov()

2019-11-19 Thread Oliver O'Halloran
Move this out of the PHB's dma_dev_setup() callback and into the
ppc_md.pcibios_fixup_iov callback. This ensures that the VF PE's
pdev pointer is always valid for the whole time the device is
added the bus.

This isn't strictly required, but it's slightly a slightly more logical
place to do the fixup and it makes dma_dev_setup a bit simpler.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 35 +++
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 45f974258766..c6ea7a504e04 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2910,9 +2910,6 @@ static void pnv_pci_ioda_fixup_iov_resources(struct 
pci_dev *pdev)
struct pci_dn *pdn;
int mul, total_vfs;
 
-   if (!pdev->is_physfn || pci_dev_is_added(pdev))
-   return;
-
pdn = pci_get_pdn(pdev);
pdn->vfs_expanded = 0;
pdn->m64_single_mode = false;
@@ -2987,6 +2984,22 @@ static void pnv_pci_ioda_fixup_iov_resources(struct 
pci_dev *pdev)
res->end = res->start - 1;
}
 }
+
+static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
+{
+   if (WARN_ON(pci_dev_is_added(pdev)))
+   return;
+
+   if (pdev->is_virtfn) {
+   /* Fix the VF PE's pdev pointer */
+   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
+   pe->pdev = pdev;
+
+   WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
+   } else if (pdev->is_physfn) {
+   pnv_pci_ioda_fixup_iov_resources(pdev);
+   }
+}
 #endif /* CONFIG_PCI_IOV */
 
 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
@@ -3641,20 +3654,6 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 {
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
struct pnv_phb *phb = hose->private_data;
-#ifdef CONFIG_PCI_IOV
-   struct pnv_ioda_pe *pe;
-
-   /* Fix the VF pdn PE number */
-   if (pdev->is_virtfn) {
-   list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-   if (pe->rid == ((pdev->bus->number << 8) |
-   (pdev->devfn & 0xff))) {
-   pe->pdev = pdev;
-   break;
-   }
-   }
-   }
-#endif /* CONFIG_PCI_IOV */
 
pnv_pci_ioda_dma_dev_setup(phb, pdev);
 }
@@ -3945,7 +3944,7 @@ static void __init pnv_pci_init_ioda_phb(struct 
device_node *np,
ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
 
 #ifdef CONFIG_PCI_IOV
-   ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
+   ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
-- 
2.21.0



[Very RFC 07/46] powernv/pci: Rework IODA PE device accounting

2019-11-19 Thread Oliver O'Halloran
The current process for configuring the IODA PEs for normal PCI devices is
abit stupid. After assigning MMIO resources for the devices on a bus the
actual PE asignment occurs when pcibios_setup_bridge() is called for the
parent bridge. In pcibios_setup_bridge() we:

1. Assign all 256 devfn's for the subordinate bus to the PE corresponding
   to the bridge window.
2. Initialise the IOMMU tables for that PE.
3. Traverse each device on the bus below the bridge setting the IOMMU table
   pointer to point at the PE's table.
4. Finally, set pci_dn->pe_number to indicate that we've done the
   per-device setup and allow EEH and the platform code to look up
   the PE number.

Although mostly functional, there's a couple of issues with this approach.
The most glaring is that it mixes the per-bus (per-PE really) setup with
the per-device setup in a way that's completely asymmetric to what happens
when tearing down a device.

In step 4. the number of devices in the PE is counted and stored in the
ioda_pe structure. When releasing a pci_dev the device count is dropped
until it hits zero where the ioda_pe itself is torn down. However, the bus
itself remains active and can be re-scanned to bring back the devices that
were removed. On a rescan we do not re-run the bridge setup so the
per-device setup is never re-done which results in the re-scanned being
unusable.

There are a few other minor issues too. Associating all 256 devfns with
the PE means that config accesses to non-existant PCI devices results
in a spurious PE freezes. We currently prevent this by only allowing config
accesses to a BDFN when there is a corresponding pci_dn structure. We
would like to eliminate that restriction in the future though.

That all said the biggest issue is that the current behaviour is hard to
follow at the best of times. On top of that the behaviour is slightly (or
majorly) different across each PHB type (PCIe, OpenCAPI, NVLink) and the
behaviour for physical devices (described above) and virtual functions is
again different. To address this we want to merge the paths as much as
possible so that the PowerNV specific PCI initialisation steps all occur
at roughly the same point in the PCI device setup path.

We can address most of these problems by moving the PE setup out of
pcibios_setup_bridge() and into pcibios_bus_add_device(). The latter is
called on a per-device basis so we have some symmetry between the setup and
teardown paths. Moving the PE assignments to here should also allow us to
converge how PE assignment works on all PHB types so it's always done in
one place.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 112 +++---
 1 file changed, 58 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c6ea7a504e04..c74521e5f3ab 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -51,6 +51,7 @@ static const char * const pnv_phb_names[] = { "IODA1", 
"IODA2", "NPU_NVLINK",
  "NPU_OCAPI" };
 
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+static void pnv_pci_configure_bus(struct pci_bus *bus);
 
 void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
@@ -1104,34 +1105,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct 
pci_dev *dev)
return pe;
 }
 
-static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
-{
-   struct pci_dev *dev;
-
-   list_for_each_entry(dev, &bus->devices, bus_list) {
-   struct pci_dn *pdn = pci_get_pdn(dev);
-
-   if (pdn == NULL) {
-   pr_warn("%s: No device node associated with device !\n",
-   pci_name(dev));
-   continue;
-   }
-
-   /*
-* In partial hotplug case, the PCI device might be still
-* associated with the PE and needn't attach it to the PE
-* again.
-*/
-   if (pdn->pe_number != IODA_INVALID_PE)
-   continue;
-
-   pe->device_count++;
-   pdn->pe_number = pe->pe_number;
-   if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
-   pnv_ioda_setup_same_PE(dev->subordinate, pe);
-   }
-}
-
 /*
  * There're 2 types of PCI bus sensitive PEs: One that is compromised of
  * single PCI bus. Another one that contains the primary PCI bus and its
@@ -1152,7 +1125,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct 
pci_bus *bus, bool all)
pe_num = phb->ioda.pe_rmap[bus->number << 8];
if (

[Very RFC 08/46] powerpc/eeh: Calculate VF index rather than looking it up in pci_dn

2019-11-19 Thread Oliver O'Halloran
Find the VF index based on the BDFN of the device rather than using a cached
value in the pci_dn. This is probably slower than looking up the cached value
in the pci_dn, but it's done infrequently (only in the EEH recovery path) and
it's just arithmatic.

We need this here because the functions to remove a VF are slightly
different to those which remove a physical PCI device.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh_driver.c | 44 +++-
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index a1eaffe868de..1cdeed464aed 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -457,12 +457,35 @@ static enum pci_ers_result eeh_report_failure(struct 
eeh_dev *edev,
return rc;
 }
 
+#ifdef CONFIG_PCI_IOV
+/* FIXME: this should probably go in drivers/pci/iov.c */
+static int eeh_find_vf_index(struct pci_dev *physfn, u16 vf_bdfn)
+{
+   u16 vf_bus, vf_devfn;
+   int i;
+
+   vf_bus = vf_bdfn >> 8;
+   vf_devfn = vf_bdfn & 0xff;
+
+   for (i = 0; i < pci_num_vf(physfn); i++) {
+   if (pci_iov_virtfn_bus(physfn, i) != vf_bus)
+   continue;
+   if (pci_iov_virtfn_devfn(physfn, i) != vf_devfn)
+   continue;
+   return i;
+   }
+
+   WARN_ON(1);
+   return -1;
+}
+
 static void *eeh_add_virt_device(struct eeh_dev *edev)
 {
-   struct pci_driver *driver;
struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+   struct pci_driver *driver;
+   int vf_index;
 
-   if (!(edev->physfn)) {
+   if (!edev->physfn) {
eeh_edev_warn(edev, "Not for VF\n");
return NULL;
}
@@ -476,11 +499,18 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
eeh_pcid_put(dev);
}
 
-#ifdef CONFIG_PCI_IOV
-   pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index);
-#endif
+   vf_index = eeh_find_vf_index(edev->physfn, edev->bdfn);
+   pci_iov_add_virtfn(edev->physfn, vf_index);
+
return NULL;
 }
+#else
+static void *eeh_add_virt_device(struct eeh_dev *edev)
+{
+   WARN_ON(1);
+   return NULL;
+}
+#endif
 
 static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
 {
@@ -521,9 +551,9 @@ static void eeh_rmv_device(struct eeh_dev *edev, void 
*userdata)
 
if (edev->physfn) {
 #ifdef CONFIG_PCI_IOV
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+   int vf_index = eeh_find_vf_index(edev->physfn, edev->bdfn);
 
-   pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
+   pci_iov_remove_virtfn(edev->physfn, vf_index);
edev->pdev = NULL;
 #endif
if (rmv_data)
-- 
2.21.0



[Very RFC 09/46] powerpc/eeh: Pass eeh_dev to eeh_ops->{read|write}_config()

2019-11-19 Thread Oliver O'Halloran
Switch the eeh_ops->{read|write}_config methods to take an eeh_dev structure
rather than a pci_dn structure to specify the target device. This removes a
lot of the uses of pci_dn in both the EEH core and in the platform EEH
support.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/eeh.h   |  4 +-
 arch/powerpc/kernel/eeh.c| 22 +-
 arch/powerpc/kernel/eeh_pe.c | 44 ++--
 arch/powerpc/platforms/powernv/eeh-powernv.c | 43 ++-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 16 ---
 5 files changed, 67 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index e11deb284631..62c4ee44ad2c 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -224,8 +224,8 @@ struct eeh_ops {
int (*configure_bridge)(struct eeh_pe *pe);
int (*err_inject)(struct eeh_pe *pe, int type, int func,
  unsigned long addr, unsigned long mask);
-   int (*read_config)(struct pci_dn *pdn, int where, int size, u32 *val);
-   int (*write_config)(struct pci_dn *pdn, int where, int size, u32 val);
+   int (*read_config)(struct eeh_dev *edev, int where, int size, u32 *val);
+   int (*write_config)(struct eeh_dev *edev, int where, int size, u32 val);
int (*next_error)(struct eeh_pe **pe);
int (*restore_config)(struct pci_dn *pdn);
int (*notify_resume)(struct pci_dn *pdn);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index a3b93db972fc..7258fa04176d 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -185,21 +185,21 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
pdn->phb->global_number, pdn->busno,
PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
 
-   eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, &cfg);
+   eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, &cfg);
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
pr_warn("EEH: PCI device/vendor: %08x\n", cfg);
 
-   eeh_ops->read_config(pdn, PCI_COMMAND, 4, &cfg);
+   eeh_ops->read_config(edev, PCI_COMMAND, 4, &cfg);
n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
pr_warn("EEH: PCI cmd/status register: %08x\n", cfg);
 
/* Gather bridge-specific registers */
if (edev->mode & EEH_DEV_BRIDGE) {
-   eeh_ops->read_config(pdn, PCI_SEC_STATUS, 2, &cfg);
+   eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, &cfg);
n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
pr_warn("EEH: Bridge secondary status: %04x\n", cfg);
 
-   eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &cfg);
+   eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &cfg);
n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
pr_warn("EEH: Bridge control: %04x\n", cfg);
}
@@ -207,11 +207,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
/* Dump out the PCI-X command and status regs */
cap = edev->pcix_cap;
if (cap) {
-   eeh_ops->read_config(pdn, cap, 4, &cfg);
+   eeh_ops->read_config(edev, cap, 4, &cfg);
n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
pr_warn("EEH: PCI-X cmd: %08x\n", cfg);
 
-   eeh_ops->read_config(pdn, cap+4, 4, &cfg);
+   eeh_ops->read_config(edev, cap+4, 4, &cfg);
n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
pr_warn("EEH: PCI-X status: %08x\n", cfg);
}
@@ -223,7 +223,7 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
pr_warn("EEH: PCI-E capabilities and status follow:\n");
 
for (i=0; i<=8; i++) {
-   eeh_ops->read_config(pdn, cap+4*i, 4, &cfg);
+   eeh_ops->read_config(edev, cap+4*i, 4, &cfg);
n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
 
if ((i % 4) == 0) {
@@ -250,7 +250,7 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
pr_warn("EEH: PCI-E AER capability register set follows:\n");
 
for (i=0; i<=13; i++) {
-   eeh_ops->read_config(pdn, cap+4*i, 4, &cfg);
+   eeh_ops->read_config(edev, cap+4*i, 4, &cfg);
n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
 
if ((i % 4) ==

[Very RFC 10/46] powerpc/eeh: Pass eeh_dev to eeh_ops->restore_config()

2019-11-19 Thread Oliver O'Halloran
Remove another pdn usage.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/eeh.h   |  2 +-
 arch/powerpc/kernel/eeh.c|  5 ++---
 arch/powerpc/kernel/eeh_pe.c |  6 ++
 arch/powerpc/platforms/powernv/eeh-powernv.c | 11 +--
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 62c4ee44ad2c..67847f8dfe71 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -227,7 +227,7 @@ struct eeh_ops {
int (*read_config)(struct eeh_dev *edev, int where, int size, u32 *val);
int (*write_config)(struct eeh_dev *edev, int where, int size, u32 val);
int (*next_error)(struct eeh_pe **pe);
-   int (*restore_config)(struct pci_dn *pdn);
+   int (*restore_config)(struct eeh_dev *edev);
int (*notify_resume)(struct pci_dn *pdn);
 };
 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 7258fa04176d..63500e34e329 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -726,7 +726,6 @@ static void eeh_disable_and_save_dev_state(struct eeh_dev 
*edev,
 
 static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
 {
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
struct pci_dev *dev = userdata;
 
@@ -734,8 +733,8 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, 
void *userdata)
return;
 
/* Apply customization from firmware */
-   if (pdn && eeh_ops->restore_config)
-   eeh_ops->restore_config(pdn);
+   if (eeh_ops->restore_config)
+   eeh_ops->restore_config(edev);
 
/* The caller should restore state for the specified device */
if (pdev != dev)
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index e11e0830f125..634963aa4a77 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -841,16 +841,14 @@ static void eeh_restore_device_bars(struct eeh_dev *edev)
  */
 static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
 {
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
-
/* Do special restore for bridges */
if (edev->mode & EEH_DEV_BRIDGE)
eeh_restore_bridge_bars(edev);
else
eeh_restore_device_bars(edev);
 
-   if (eeh_ops->restore_config && pdn)
-   eeh_ops->restore_config(pdn);
+   if (eeh_ops->restore_config)
+   eeh_ops->restore_config(edev);
 }
 
 /**
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 54d8ec77aef2..6c5d9f1bc378 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1644,12 +1644,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
return ret;
 }
 
-static int pnv_eeh_restore_config(struct pci_dn *pdn)
+static int pnv_eeh_restore_config(struct eeh_dev *edev)
 {
-   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
struct pnv_phb *phb;
s64 ret = 0;
-   int config_addr = (pdn->busno << 8) | (pdn->devfn);
 
if (!edev)
return -EEXIST;
@@ -1658,13 +1656,14 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
if (edev->physfn)
return 0;
 
-   phb = pdn->phb->private_data;
+   phb = edev->pe->phb->private_data;
ret = opal_pci_reinit(phb->opal_id,
- OPAL_REINIT_PCI_DEV, config_addr);
+ OPAL_REINIT_PCI_DEV, edev->bdfn);
 
+   ret = opal_pci_reinit(phb->opal_id, OPAL_REINIT_PCI_DEV, edev->bdfn);
if (ret) {
pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
-   __func__, config_addr, ret);
+   __func__, edev->bdfn, ret);
return -EIO;
}
 
-- 
2.21.0



[Very RFC 11/46] powerpc/eeh: Convert various printfs to use edev, not pci_dn

2019-11-19 Thread Oliver O'Halloran
We use the pci_dn to retrieve the domain, bus, device, and function numbers for
an EEH device. We now have that in the eeh_dev so covert the various printk()s
we have around the place to source that information from the eeh_dev.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh.c| 14 --
 arch/powerpc/kernel/eeh_pe.c | 14 ++
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 63500e34e329..c8039fdb23ba 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -167,23 +167,17 @@ void eeh_show_enabled(void)
  */
 static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
 {
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
u32 cfg;
int cap, i;
int n = 0, l = 0;
char buffer[128];
 
-   if (!pdn) {
-   pr_warn("EEH: Note: No error log for absent device.\n");
-   return 0;
-   }
-
n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
-  pdn->phb->global_number, pdn->busno,
-  PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+   edev->pe->phb->global_number, edev->bdfn >> 8,
+   PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
-   pdn->phb->global_number, pdn->busno,
-   PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+   edev->pe->phb->global_number, edev->bdfn >> 8,
+   PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
 
eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, &cfg);
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 634963aa4a77..831f363f1732 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -366,9 +366,8 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev 
*edev)
  */
 int eeh_add_to_parent_pe(struct eeh_dev *edev)
 {
+   int config_addr = edev->bdfn;
struct eeh_pe *pe, *parent;
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
-   int config_addr = (pdn->busno << 8) | (pdn->devfn);
 
/* Check if the PE number is valid */
if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
@@ -382,7 +381,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 * PE should be composed of PCI bus and its subordinate
 * components.
 */
-   pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
+   pe = eeh_pe_get(edev->controller, edev->pe_config_addr, config_addr);
if (pe) {
if (pe->type & EEH_PE_INVALID) {
list_add_tail(&edev->entry, &pe->edevs);
@@ -416,9 +415,9 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 
/* Create a new EEH PE */
if (edev->physfn)
-   pe = eeh_pe_alloc(pdn->phb, EEH_PE_VF);
+   pe = eeh_pe_alloc(edev->controller, EEH_PE_VF);
else
-   pe = eeh_pe_alloc(pdn->phb, EEH_PE_DEVICE);
+   pe = eeh_pe_alloc(edev->controller, EEH_PE_DEVICE);
if (!pe) {
pr_err("%s: out of memory!\n", __func__);
return -ENOMEM;
@@ -434,10 +433,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 */
parent = eeh_pe_get_parent(edev);
if (!parent) {
-   parent = eeh_phb_pe_get(pdn->phb);
+   parent = eeh_phb_pe_get(edev->controller);
if (!parent) {
pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
-   __func__, pdn->phb->global_number);
+   __func__, edev->controller->global_number);
edev->pe = NULL;
kfree(pe);
return -EEXIST;
@@ -698,7 +697,6 @@ void eeh_pe_state_clear(struct eeh_pe *root, int state, 
bool include_passed)
  */
 static void eeh_bridge_check_link(struct eeh_dev *edev)
 {
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
int cap;
uint32_t val;
int timeout = 0;
-- 
2.21.0



[Very RFC 12/46] powerpc/eeh: Split eeh_probe into probe_pdn and probe_pdev

2019-11-19 Thread Oliver O'Halloran
The EEH core has a concept of "early probe" and "late probe." When the
EEH_PROBE_MODE_DEVTREE flag is set (i.e pseries) we call the eeh_ops->probe()
function in eeh_add_device_early() so the eeh_dev state is initialised based on
the pci_dn. It's important to realise that this happens *long* before the PCI
device has been probed and a pci_dev structure created. This is necessary due
to a PAPR requirement that EEH be enabled before to OS starts interacting
with the device.

The late probe is done in eeh_add_device_late() when the EEH_PROBE_MODE_DEV
flag is set (i.e. PowerNV). The main difference is the late probe happens
after the pci_dev has been created. As a result there is no actual dependency
on a pci_dn in the late probe case. Splitting the single eeh_ops->probe()
function into seperate functions allows us to simplify the late probe case
since we have access to a pci_dev at that point. Having access to a pci_dev
means that we can use the functions provided by the PCI core for finding
capabilities, etc rather than doing it manually.

It also changes the prototype for the probe functions to be void. Currently
they return a void *, but both implementations always return NULL so there's
not much point to it.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/eeh.h   |  3 +-
 arch/powerpc/kernel/eeh.c|  6 ++--
 arch/powerpc/platforms/powernv/eeh-powernv.c | 29 ++--
 arch/powerpc/platforms/pseries/eeh_pseries.c | 13 -
 4 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 67847f8dfe71..466b0165fbcf 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -215,7 +215,8 @@ enum {
 struct eeh_ops {
char *name;
int (*init)(void);
-   void* (*probe)(struct pci_dn *pdn, void *data);
+   void (*probe_pdn)(struct pci_dn *pdn);/* used on pseries */
+   void (*probe_pdev)(struct pci_dev *pdev); /* used on powernv */
int (*set_option)(struct eeh_pe *pe, int option);
int (*get_pe_addr)(struct eeh_pe *pe);
int (*get_state)(struct eeh_pe *pe, int *delay);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index c8039fdb23ba..087a98b42a8c 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1066,7 +1066,7 @@ void eeh_add_device_early(struct pci_dn *pdn)
(eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid))
return;
 
-   eeh_ops->probe(pdn, NULL);
+   eeh_ops->probe_pdn(pdn);
 }
 
 /**
@@ -1135,8 +1135,8 @@ void eeh_add_device_late(struct pci_dev *dev)
dev->dev.archdata.edev = NULL;
}
 
-   if (eeh_has_flag(EEH_PROBE_MODE_DEV))
-   eeh_ops->probe(pdn, NULL);
+   if (eeh_ops->probe_pdev && eeh_has_flag(EEH_PROBE_MODE_DEV))
+   eeh_ops->probe_pdev(dev);
 
edev->pdev = dev;
dev->dev.archdata.edev = edev;
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6c5d9f1bc378..8bd5317aa878 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -346,23 +346,13 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
 
 /**
  * pnv_eeh_probe - Do probe on PCI device
- * @pdn: PCI device node
- * @data: unused
+ * @pdev: pci_dev to probe
  *
- * When EEH module is installed during system boot, all PCI devices
- * are checked one by one to see if it supports EEH. The function
- * is introduced for the purpose. By default, EEH has been enabled
- * on all PCI devices. That's to say, we only need do necessary
- * initialization on the corresponding eeh device and create PE
- * accordingly.
- *
- * It's notable that's unsafe to retrieve the EEH device through
- * the corresponding PCI device. During the PCI device hotplug, which
- * was possiblly triggered by EEH core, the binding between EEH device
- * and the PCI device isn't built yet.
+ * Creates (or finds an existing) edev for this pci_dev.
  */
-static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
+static void pnv_eeh_probe_pdev(struct pci_dev *pdev)
 {
+   struct pci_dn *pdn = pci_get_pdn(pdev);
struct pci_controller *hose = pdn->phb;
struct pnv_phb *phb = hose->private_data;
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
@@ -377,11 +367,11 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 * the probing.
 */
if (!edev || edev->pe)
-   return NULL;
+   return;
 
/* Skip for PCI-ISA bridge */
if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
-   return NULL;
+   return;
 
eeh_edev_dbg(edev, "Probing device\n");
 
@@ -

[Very RFC 13/46] powerpc/eeh: Rework how pdev_probe() is used

2019-11-19 Thread Oliver O'Halloran
Adjust how the EEH core uses the eeh_ops->probe_pdev() so that it returns
the eeh_dev for the passed-in pci_dev. Currently mapping an pci_dev to an
eeh_dev is done by finding the pci_dn for the pci_dev, then using the
back-pointer to the eeh_dev stashed in the pci_dn.

We want to move away from using pci_dn on PowerNV and moving the eeh_dev
lookup into probe_pdev() allows the EEH core to be oblivious of how the
mapping is actually done.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/eeh.h   | 16 +++--
 arch/powerpc/kernel/eeh.c| 34 
 arch/powerpc/platforms/powernv/eeh-powernv.c | 20 +---
 arch/powerpc/platforms/pseries/eeh_pseries.c | 19 ++-
 4 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 466b0165fbcf..e109bfd3dd57 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -215,8 +215,20 @@ enum {
 struct eeh_ops {
char *name;
int (*init)(void);
-   void (*probe_pdn)(struct pci_dn *pdn);/* used on pseries */
-   void (*probe_pdev)(struct pci_dev *pdev); /* used on powernv */
+
+   /*
+* on pseries the eeh_dev is initialised before the pci_dev exists
+* using the contents of the pci_dn.
+*/
+   void (*probe_pdn)(struct pci_dn *pdn);
+
+   /*
+* probe_pdev() is used to find, and possibly create, an eeh_dev
+* for a pci_dev. The EEH core binds the returned device to the
+* pci_dev.
+*/
+   struct eeh_dev *(*probe_pdev)(struct pci_dev *pdev);
+
int (*set_option)(struct eeh_pe *pe, int option);
int (*get_pe_addr)(struct eeh_pe *pe);
int (*get_state)(struct eeh_pe *pe, int *delay);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 087a98b42a8c..58a8299ac417 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1099,17 +1099,24 @@ EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
  */
 void eeh_add_device_late(struct pci_dev *dev)
 {
-   struct pci_dn *pdn;
struct eeh_dev *edev;
 
if (!dev)
return;
 
-   pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
-   edev = pdn_to_eeh_dev(pdn);
-   eeh_edev_dbg(edev, "Adding device\n");
-   if (edev->pdev == dev) {
-   eeh_edev_dbg(edev, "Device already referenced!\n");
+   pr_debug("EEH: Adding device %s\n", pci_name(dev));
+
+   /* pci_dev_to_eeh_dev() can only work if archdata.edev is already set */
+   edev = pci_dev_to_eeh_dev(dev);
+   if (edev) {
+   /* FIXME: I don't remember why this isn't an error, but it's 
not */
+   eeh_edev_dbg(edev, "Already bound to an eeh_dev!\n");
+   return;
+   }
+
+   edev = eeh_ops->probe_pdev(dev);
+   if (!edev) {
+   pr_debug("EEH: Adding device failed\n");
return;
}
 
@@ -1118,8 +1125,13 @@ void eeh_add_device_late(struct pci_dev *dev)
 * unbalanced kref to the device during unplug time, which
 * relies on pcibios_release_device(). So we have to remove
 * that here explicitly.
+*
+* FIXME: This really shouldn't be necessary. We should probably
+* tear down the EEH state when we detatch the pci_dev from the
+* bus. We might need to move the bus notifiers out of the platforms
+* first.
 */
-   if (edev->pdev) {
+   if (edev->pdev && edev->pdev != dev) {
eeh_rmv_from_parent_pe(edev);
eeh_addr_cache_rmv_dev(edev->pdev);
eeh_sysfs_remove_device(edev->pdev);
@@ -1130,17 +1142,11 @@ void eeh_add_device_late(struct pci_dev *dev)
 * into error handler afterwards.
 */
edev->mode |= EEH_DEV_NO_HANDLER;
-
-   edev->pdev = NULL;
-   dev->dev.archdata.edev = NULL;
}
 
-   if (eeh_ops->probe_pdev && eeh_has_flag(EEH_PROBE_MODE_DEV))
-   eeh_ops->probe_pdev(dev);
-
+   /* bind the pdev and the edev together */
edev->pdev = dev;
dev->dev.archdata.edev = edev;
-
eeh_addr_cache_insert_dev(dev);
eeh_sysfs_add_device(dev);
 }
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 8bd5317aa878..5250c4525544 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -348,9 +348,9 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
  * pnv_eeh_probe - Do probe on PCI device
  * @pdev: pci_dev to probe
  *
- * Creates (or finds an existing) edev for this pci_dev.
+ * Create, or find the existing, eeh_dev for this pci_dev.

[Very RFC 14/46] powernv/eeh: Remove un-necessary call to eeh_add_device_early()

2019-11-19 Thread Oliver O'Halloran
eeh_add_device_early() is used to initialise the EEH state for a PCI device
based on the contents of it's devicetree node. It doesn't do anything
unless EEH_FLAG_PROBE_MODE_DEVTREE is set and that only happens on pseries.

Remove the call to eeh_add_device_early() in the powernv code to squash
another pci_dn usage.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 5250c4525544..aa2935a08464 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -40,13 +40,10 @@ static int eeh_event_irq = -EINVAL;
 
 void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
 {
-   struct pci_dn *pdn = pci_get_pdn(pdev);
-
-   if (!pdn || eeh_has_flag(EEH_FORCE_DISABLED))
+   if (eeh_has_flag(EEH_FORCE_DISABLED))
return;
 
dev_dbg(&pdev->dev, "EEH: Setting up device\n");
-   eeh_add_device_early(pdn);
eeh_add_device_late(pdev);
 }
 
-- 
2.21.0



[Very RFC 15/46] powernv/eeh: Use pnv_eeh_*_config() for internal config ops

2019-11-19 Thread Oliver O'Halloran
Use the pnv_eeh_{read|write}_config() functions that take an edev rather
than a pci_dn. This allows us to remove most of the explict uses of pci_dn
in the PowerNV EEH backend and localises them into a few functions which we
can fix later.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 153 +--
 1 file changed, 70 insertions(+), 83 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index aa2935a08464..aaccb3768393 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -278,27 +278,73 @@ int pnv_eeh_post_init(void)
return ret;
 }
 
-static int pnv_eeh_find_cap(struct pci_dn *pdn, int cap)
+static inline bool pnv_eeh_cfg_blocked(struct eeh_dev *edev)
+{
+   if (!edev || !edev->pe)
+   return false;
+
+   /*
+* We will issue FLR or AF FLR to all VFs, which are contained
+* in VF PE. It relies on the EEH PCI config accessors. So we
+* can't block them during the window.
+*/
+   if (edev->physfn && (edev->pe->state & EEH_PE_RESET))
+   return false;
+
+   if (edev->pe->state & EEH_PE_CFG_BLOCKED)
+   return true;
+
+   return false;
+}
+
+static int pnv_eeh_read_config(struct eeh_dev *edev,
+  int where, int size, u32 *val)
+{
+   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+   if (!pdn)
+   return PCIBIOS_DEVICE_NOT_FOUND;
+
+   if (pnv_eeh_cfg_blocked(edev)) {
+   *val = 0x;
+   return PCIBIOS_SET_FAILED;
+   }
+
+   return pnv_pci_cfg_read(pdn, where, size, val);
+}
+
+static int pnv_eeh_write_config(struct eeh_dev *edev,
+   int where, int size, u32 val)
+{
+   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+   if (!pdn)
+   return PCIBIOS_DEVICE_NOT_FOUND;
+
+   if (pnv_eeh_cfg_blocked(edev))
+   return PCIBIOS_SET_FAILED;
+
+   return pnv_pci_cfg_write(pdn, where, size, val);
+}
+
+static int pnv_eeh_find_cap(struct eeh_dev *edev, int cap)
 {
int pos = PCI_CAPABILITY_LIST;
int cnt = 48;   /* Maximal number of capabilities */
u32 status, id;
 
-   if (!pdn)
-   return 0;
-
/* Check if the device supports capabilities */
-   pnv_pci_cfg_read(pdn, PCI_STATUS, 2, &status);
+   pnv_eeh_read_config(edev, PCI_STATUS, 2, &status);
if (!(status & PCI_STATUS_CAP_LIST))
return 0;
 
while (cnt--) {
-   pnv_pci_cfg_read(pdn, pos, 1, &pos);
+   pnv_eeh_read_config(edev, pos, 1, &pos);
if (pos < 0x40)
break;
 
pos &= ~3;
-   pnv_pci_cfg_read(pdn, pos + PCI_CAP_LIST_ID, 1, &id);
+   pnv_eeh_read_config(edev, pos + PCI_CAP_LIST_ID, 1, &id);
if (id == 0xff)
break;
 
@@ -313,15 +359,14 @@ static int pnv_eeh_find_cap(struct pci_dn *pdn, int cap)
return 0;
 }
 
-static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
+static int pnv_eeh_find_ecap(struct eeh_dev *edev, int cap)
 {
-   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
u32 header;
int pos = 256, ttl = (4096 - 256) / 8;
 
if (!edev || !edev->pcie_cap)
return 0;
-   if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+   if (pnv_eeh_read_config(edev, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
return 0;
else if (!header)
return 0;
@@ -334,7 +379,7 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
if (pos < 256)
break;
 
-   if (pnv_pci_cfg_read(pdn, pos, 4, &header) != 
PCIBIOS_SUCCESSFUL)
+   if (pnv_eeh_read_config(edev, pos, 4, &header) != 
PCIBIOS_SUCCESSFUL)
break;
}
 
@@ -382,15 +427,14 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev 
*pdev)
 
/* Initialize eeh device */
edev->class_code = pdn->class_code;
-   edev->mode  &= 0xFF00;
-   edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
-   edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
-   edev->af_cap   = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF);
-   edev->aer_cap  = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
+   edev->pcix_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_PCIX);
+   edev->pcie_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_EXP);
+   edev->af_cap   = pnv_eeh_find_cap(edev, PCI_CAP_ID_AF);
+   edev->aer_cap  = pnv_eeh_find_ecap(edev, PCI_EXT_CAP_ID_ERR);
if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI)

[Very RFC 16/46] powernv/eeh: Use eeh_edev_warn() rather than open-coding a BDFN print

2019-11-19 Thread Oliver O'Halloran
Neaten things up a bit and remove a pci_dn use.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index aaccb3768393..f58fe6bda46e 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1001,10 +1001,8 @@ static void pnv_eeh_wait_for_pending(struct eeh_dev 
*edev, const char *type,
msleep((1 << i) * 100);
}
 
-   pr_warn("%s: Pending transaction while issuing %sFLR to 
%04x:%02x:%02x.%01x\n",
-   __func__, type,
-   pdn->phb->global_number, pdn->busno,
-   PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+   eeh_edev_warn(edev, "%s: Pending transaction while issuing %sFLR\n",
+   __func__, type);
 }
 
 static int pnv_eeh_do_flr(struct eeh_dev *edev, int option)
-- 
2.21.0



[Very RFC 17/46] powernv/eeh: add pnv_eeh_find_edev()

2019-11-19 Thread Oliver O'Halloran
To get away from using pci_dn we need a way to find the edev for a given
bdfn. The easiest way to do this is to find the ioda_pe for that BDFN in
the PHB's reverse mapping table and scan the device list of the
corresponding eeh_pe.

Is this slow? Yeah probably. Is it slower than the existing "traverse the
pdn tree" method? Probably not.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 31 
 arch/powerpc/platforms/powernv/pci.h |  2 ++
 2 files changed, 33 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index f58fe6bda46e..a974822c5097 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -278,6 +278,37 @@ int pnv_eeh_post_init(void)
return ret;
 }
 
+struct eeh_dev *pnv_eeh_find_edev(struct pnv_phb *phb, u16 bdfn)
+{
+   struct pnv_ioda_pe *ioda_pe;
+   struct eeh_dev *tmp, *edev;
+   struct eeh_pe *pe;
+
+   /* EEH not enabled ? */
+   if (!(phb->flags & PNV_PHB_FLAG_EEH))
+   return NULL;
+
+   /* Fish the EEH PE from the IODA PE */
+   ioda_pe = __pnv_ioda_get_pe(phb, bdfn);
+   if (!ioda_pe)
+   return NULL;
+
+   /*
+* FIXME: Doing a tree-traversal followed by a list traversal
+* on every config access is dumb. Not much dumber than the pci_dn
+* tree traversal we did before, but still quite dumb.
+*/
+   pe = eeh_pe_get(phb->hose, ioda_pe->pe_number, 0);
+   if (!pe)
+   return NULL;
+
+   eeh_pe_for_each_dev(pe, edev, tmp)
+   if (edev->bdfn == bdfn)
+   return edev;
+
+   return NULL;
+}
+
 static inline bool pnv_eeh_cfg_blocked(struct eeh_dev *edev)
 {
if (!edev || !edev->pe)
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 3c33a0c91a69..a343f3c8e65c 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -196,6 +196,8 @@ extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, 
unsigned int virq);
 extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
__u64 window_size, __u32 levels);
 extern int pnv_eeh_post_init(void);
+struct eeh_dev;
+struct eeh_dev *pnv_eeh_find_edev(struct pnv_phb *phb, u16 bdfn);
 
 __printf(3, 4)
 extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
-- 
2.21.0



[Very RFC 18/46] powernv/pci: Add pci_bus_to_pnvhb() helper

2019-11-19 Thread Oliver O'Halloran
Add a helper to go from a pci_bus structure to the pnv_phb that hosts that
bus. There's a lot of instances of the following pattern:

struct pci_controller *hose = pci_bus_to_host(pdev->bus);
struct pnv_phb *phb = hose->private_data;

Without any other uses of the pci_controller inside the function. This is
hard to read since it requires you to memorise the contents of the
private data fields and kind of error prone since it involves blindly
assigning a void pointer. Add a helper to make it more concise and
explict.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 88 +++
 arch/powerpc/platforms/powernv/pci.c  | 18 ++---
 arch/powerpc/platforms/powernv/pci.h  | 10 +++
 3 files changed, 39 insertions(+), 77 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c74521e5f3ab..a1c9315f3208 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -252,8 +252,7 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb)
 static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
 unsigned long *pe_bitmap)
 {
-   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-   struct pnv_phb *phb = hose->private_data;
+   struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
struct resource *r;
resource_size_t base, sgsz, start, end;
int segno, i;
@@ -351,8 +350,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
 
 static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
 {
-   struct pci_controller *hose = pci_bus_to_host(bus);
-   struct pnv_phb *phb = hose->private_data;
+   struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
struct pnv_ioda_pe *master_pe, *pe;
unsigned long size, *pe_alloc;
int i;
@@ -673,8 +671,7 @@ struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, 
u16 bdfn)
 
 struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 {
-   struct pci_controller *hose = pci_bus_to_host(dev->bus);
-   struct pnv_phb *phb = hose->private_data;
+   struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
struct pci_dn *pdn = pci_get_pdn(dev);
 
if (!pdn)
@@ -1053,8 +1050,7 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, 
int offset)
 
 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 {
-   struct pci_controller *hose = pci_bus_to_host(dev->bus);
-   struct pnv_phb *phb = hose->private_data;
+   struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
struct pci_dn *pdn = pci_get_pdn(dev);
struct pnv_ioda_pe *pe;
 
@@ -1113,8 +1109,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct 
pci_dev *dev)
  */
 static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 {
-   struct pci_controller *hose = pci_bus_to_host(bus);
-   struct pnv_phb *phb = hose->private_data;
+   struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
struct pnv_ioda_pe *pe = NULL;
unsigned int pe_num;
 
@@ -1181,8 +1176,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct 
pci_dev *npu_pdev)
struct pnv_ioda_pe *pe;
struct pci_dev *gpu_pdev;
struct pci_dn *npu_pdn;
-   struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
-   struct pnv_phb *phb = hose->private_data;
+   struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus);
 
/*
 * Due to a hardware errata PE#0 on the NPU is reserved for
@@ -1279,16 +1273,12 @@ static void pnv_pci_ioda_setup_PEs(void)
 #ifdef CONFIG_PCI_IOV
 static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
 {
-   struct pci_bus*bus;
-   struct pci_controller *hose;
struct pnv_phb*phb;
struct pci_dn *pdn;
inti, j;
intm64_bars;
 
-   bus = pdev->bus;
-   hose = pci_bus_to_host(bus);
-   phb = hose->private_data;
+   phb = pci_bus_to_pnvhb(pdev->bus);
pdn = pci_get_pdn(pdev);
 
if (pdn->m64_single_mode)
@@ -1312,8 +1302,6 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev, 
u16 num_vfs)
 
 static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 {
-   struct pci_bus*bus;
-   struct pci_controller *hose;
struct pnv_phb*phb;
struct pci_dn *pdn;
unsigned int   win;
@@ -1325,9 +1313,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, 
u16 num_vfs)
intpe_num;
intm64_bars;
 
-   bus = pdev->bus;
-   hose = pci_bus_to_host(bus);
-   phb = hose->private_data;
+   phb = pci_bus_to_pnvhb(pdev->bus);
pdn = pci_get_pdn(

[Very RFC 19/46] powernv/eeh: Use standard PCI capability lookup functions

2019-11-19 Thread Oliver O'Halloran
We have a pci_dev so we can use the functions provided by the PCI core for
looking up capabilities. This should be safe since these are only called
when initialising the eeh_dev when the device is first probed and not in
the EEH recovery path where config accesses are blocked.

This might cause a problem if an EEH event occured while probing the device,
but I'm pretty sure that's going to be broken anyway.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 67 ++--
 1 file changed, 4 insertions(+), 63 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index a974822c5097..b79aca8368c6 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -358,65 +358,6 @@ static int pnv_eeh_write_config(struct eeh_dev *edev,
return pnv_pci_cfg_write(pdn, where, size, val);
 }
 
-static int pnv_eeh_find_cap(struct eeh_dev *edev, int cap)
-{
-   int pos = PCI_CAPABILITY_LIST;
-   int cnt = 48;   /* Maximal number of capabilities */
-   u32 status, id;
-
-   /* Check if the device supports capabilities */
-   pnv_eeh_read_config(edev, PCI_STATUS, 2, &status);
-   if (!(status & PCI_STATUS_CAP_LIST))
-   return 0;
-
-   while (cnt--) {
-   pnv_eeh_read_config(edev, pos, 1, &pos);
-   if (pos < 0x40)
-   break;
-
-   pos &= ~3;
-   pnv_eeh_read_config(edev, pos + PCI_CAP_LIST_ID, 1, &id);
-   if (id == 0xff)
-   break;
-
-   /* Found */
-   if (id == cap)
-   return pos;
-
-   /* Next one */
-   pos += PCI_CAP_LIST_NEXT;
-   }
-
-   return 0;
-}
-
-static int pnv_eeh_find_ecap(struct eeh_dev *edev, int cap)
-{
-   u32 header;
-   int pos = 256, ttl = (4096 - 256) / 8;
-
-   if (!edev || !edev->pcie_cap)
-   return 0;
-   if (pnv_eeh_read_config(edev, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
-   return 0;
-   else if (!header)
-   return 0;
-
-   while (ttl-- > 0) {
-   if (PCI_EXT_CAP_ID(header) == cap && pos)
-   return pos;
-
-   pos = PCI_EXT_CAP_NEXT(header);
-   if (pos < 256)
-   break;
-
-   if (pnv_eeh_read_config(edev, pos, 4, &header) != 
PCIBIOS_SUCCESSFUL)
-   break;
-   }
-
-   return 0;
-}
-
 /**
  * pnv_eeh_probe - Do probe on PCI device
  * @pdev: pci_dev to probe
@@ -458,10 +399,10 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev 
*pdev)
 
/* Initialize eeh device */
edev->class_code = pdn->class_code;
-   edev->pcix_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_PCIX);
-   edev->pcie_cap = pnv_eeh_find_cap(edev, PCI_CAP_ID_EXP);
-   edev->af_cap   = pnv_eeh_find_cap(edev, PCI_CAP_ID_AF);
-   edev->aer_cap  = pnv_eeh_find_ecap(edev, PCI_EXT_CAP_ID_ERR);
+   edev->pcix_cap = pci_find_capability(pdev, PCI_CAP_ID_PCIX);
+   edev->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+   edev->af_cap   = pci_find_capability(pdev, PCI_CAP_ID_AF);
+   edev->aer_cap  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
edev->mode |= EEH_DEV_BRIDGE;
if (edev->pcie_cap) {
-- 
2.21.0



[Very RFC 20/46] powernv/eeh: Look up device info from pci_dev

2019-11-19 Thread Oliver O'Halloran
Most of what we fetch from the pci_dn is also in the pci_dev structure. Convert
the pnv_eeh_probe_pdev() to use the pdev fields rather than the pci_dn so we can
get rid of pci_dn eventually.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 26 ++--
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index b79aca8368c6..6ba74836a9f8 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -372,7 +372,7 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev 
*pdev)
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
uint32_t pcie_flags;
int ret;
-   int config_addr = (pdn->busno << 8) | (pdn->devfn);
+   int config_addr = (pdev->bus->number << 8) | (pdev->devfn);
 
/*
 * When probing the root bridge, which doesn't have any
@@ -392,18 +392,18 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev 
*pdev)
}
 
/* Skip for PCI-ISA bridge */
-   if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
+   if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
 
eeh_edev_dbg(edev, "Probing device\n");
 
/* Initialize eeh device */
-   edev->class_code = pdn->class_code;
+   edev->class_code = pdev->class;
edev->pcix_cap = pci_find_capability(pdev, PCI_CAP_ID_PCIX);
edev->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
edev->af_cap   = pci_find_capability(pdev, PCI_CAP_ID_AF);
edev->aer_cap  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
-   if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
+   if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
edev->mode |= EEH_DEV_BRIDGE;
if (edev->pcie_cap) {
pnv_eeh_read_config(edev, edev->pcie_cap + 
PCI_EXP_FLAGS,
@@ -443,14 +443,14 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev 
*pdev)
 * Broadcom Shiner 4-ports 1G NICs (14e4:168a)
 * Broadcom Shiner 2-ports 10G NICs (14e4:168e)
 */
-   if ((pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
-pdn->device_id == 0x1656) ||
-   (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
-pdn->device_id == 0x1657) ||
-   (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
-pdn->device_id == 0x168a) ||
-   (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
-pdn->device_id == 0x168e))
+   if ((pdev->vendor == PCI_VENDOR_ID_BROADCOM &&
+pdev->device == 0x1656) ||
+   (pdev->vendor == PCI_VENDOR_ID_BROADCOM &&
+pdev->device == 0x1657) ||
+   (pdev->vendor == PCI_VENDOR_ID_BROADCOM &&
+pdev->device == 0x168a) ||
+   (pdev->vendor == PCI_VENDOR_ID_BROADCOM &&
+pdev->device == 0x168e))
edev->pe->state |= EEH_PE_CFG_RESTRICTED;
 
/*
@@ -461,7 +461,7 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev 
*pdev)
 */
if (!(edev->pe->state & EEH_PE_PRI_BUS)) {
edev->pe->bus = pci_find_bus(hose->global_number,
-pdn->busno);
+pdev->bus->number);
if (edev->pe->bus)
edev->pe->state |= EEH_PE_PRI_BUS;
}
-- 
2.21.0



[Very RFC 21/46] powernv/eeh: Rework finding an existing edev in probe_pdev()

2019-11-19 Thread Oliver O'Halloran
Use the pnv_eeh_find_edev() helper to look up the eeh_dev for a device
rather than doing it via the pci_dn.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 44 ++--
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6ba74836a9f8..1cd80b35 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -374,20 +374,40 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev 
*pdev)
int ret;
int config_addr = (pdev->bus->number << 8) | (pdev->devfn);
 
+   pci_dbg(pdev, "%s: probing\n", __func__);
+
/*
-* When probing the root bridge, which doesn't have any
-* subordinate PCI devices. We don't have OF node for
-* the root bridge. So it's not reasonable to continue
-* the probing.
+* EEH keeps the eeh_dev alive over a recovery pass even when the
+* corresponding pci_dev has been torn down. In that case we need
+* to find the existing eeh_dev and re-bind the two.
 */
-   if (!edev || edev->pe)
-   return NULL;
+   edev = pnv_eeh_find_edev(phb, config_addr);
+   if (edev) {
+   eeh_edev_dbg(edev, "Found existing edev!\n");
+
+   /*
+* XXX: eeh_remove_device() clears pdev so we shouldn't hit this
+* normally. I've found that screwing around with the pci probe
+* path can result in eeh_probe_pdev() being called twice. This
+* is harmless at the moment, but it's pretty strange so emit a
+* warning to be on the safe side.
+*/
+   if (WARN_ON(edev->pdev))
+   eeh_edev_dbg(edev, "%s: already bound to a pdev!\n", 
__func__);
+
+   edev->pdev = pdev;
+
+   /* should we be doing something with REMOVED too? */
+   edev->mode &= EEH_DEV_DISCONNECTED;
+
+   /* update the primary bus if we need to */
+   // XXX: why do we need to do this? is the pci_bus going away? 
what cleared the flag?
+   if (!(edev->pe->state & EEH_PE_PRI_BUS)) {
+   edev->pe->bus = pdev->bus;
+   if (edev->pe->bus)
+   edev->pe->state |= EEH_PE_PRI_BUS;
+   }
 
-   /* already configured? */
-   if (edev->pdev) {
-   pr_debug("%s: found existing edev for %04x:%02x:%02x.%01x\n",
-   __func__, hose->global_number, config_addr >> 8,
-   PCI_SLOT(config_addr), PCI_FUNC(config_addr));
return edev;
}
 
@@ -395,8 +415,6 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev 
*pdev)
if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
 
-   eeh_edev_dbg(edev, "Probing device\n");
-
/* Initialize eeh device */
edev->class_code = pdev->class;
edev->pcix_cap = pci_find_capability(pdev, PCI_CAP_ID_PCIX);
-- 
2.21.0



[Very RFC 22/46] powernv/eeh: Allocate eeh_dev's when needed

2019-11-19 Thread Oliver O'Halloran
Have the PowerNV EEH backend allocate the eeh_dev if needed rather than using
the one attached to the pci_dn. This gets us most of the way towards decoupling
pci_dn from the PowerNV EEH code.

Signed-off-by: Oliver O'Halloran 
---
We should probably be free()ing the eeh_dev somewhere. The pci_dev release
function is the right place for it.
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 22 
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 1cd80b35..7aba18e08996 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -366,10 +366,9 @@ static int pnv_eeh_write_config(struct eeh_dev *edev,
  */
 static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev *pdev)
 {
-   struct pci_dn *pdn = pci_get_pdn(pdev);
-   struct pci_controller *hose = pdn->phb;
-   struct pnv_phb *phb = hose->private_data;
-   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+   struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+   struct pci_controller *hose = phb->hose;
+   struct eeh_dev *edev;
uint32_t pcie_flags;
int ret;
int config_addr = (pdev->bus->number << 8) | (pdev->devfn);
@@ -415,12 +414,27 @@ static struct eeh_dev *pnv_eeh_probe_pdev(struct pci_dev 
*pdev)
if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
 
+   /* otherwise allocate and initialise a new eeh_dev */
+   edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+   if (!edev) {
+   pr_err("%s: out of memory lol\n", __func__);
+   return NULL;
+   }
+
/* Initialize eeh device */
+   edev->bdfn   = config_addr;
+   edev->controller = phb->hose;
+
edev->class_code = pdev->class;
edev->pcix_cap = pci_find_capability(pdev, PCI_CAP_ID_PCIX);
edev->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
edev->af_cap   = pci_find_capability(pdev, PCI_CAP_ID_AF);
edev->aer_cap  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ERR);
+
+   /* TODO: stash the vf_index in here? */
+   if (pdev->is_virtfn)
+   edev->physfn = pdev->physfn;
+
if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
edev->mode |= EEH_DEV_BRIDGE;
if (edev->pcie_cap) {
-- 
2.21.0



[Very RFC 23/46] powerpc/eeh: Moving finding the parent PE into the platform

2019-11-19 Thread Oliver O'Halloran
Currently the generic EEH code uses the pci_dn of a device to look up the
PE of the device's parent bridge, or physical function. The generic
function to insert the edev (and possibly create the eeh_pe) is called
from the probe functions already so this is a relatively minor change.

The existing lookup method moves into the pseries platform and PowerNV
can choose the PE based on the bus heirachy instead.

Signed-off-by: Oliver O'Halloran 
---
"parent" meaning "parent of the PE that actually contains this edev"
is stupid, but it's stupid consistent with what's there already. Also
I couldn't think of a way to fix it without adding a bunch of boring
boilerplate at the call sites.

FIXME: I think I introduced a bug here. Currently we coalase a switch's
upstream port bus and the downstream port bus into a single PE since
they're a single failure domain. That seems to have been broken by
this patch, but whatever.
---
 arch/powerpc/include/asm/eeh.h   |  2 +-
 arch/powerpc/kernel/eeh_pe.c | 54 -
 arch/powerpc/platforms/powernv/eeh-powernv.c | 25 +++-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 61 
 4 files changed, 86 insertions(+), 56 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index e109bfd3dd57..70d3e01dbe9d 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -295,7 +295,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root);
 struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
  int pe_no, int config_addr);
-int eeh_add_to_parent_pe(struct eeh_dev *edev);
+int eeh_add_to_parent_pe(struct eeh_pe *parent, struct eeh_dev *edev);
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
 void eeh_pe_update_time_stamp(struct eeh_pe *pe);
 void *eeh_pe_traverse(struct eeh_pe *root,
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 831f363f1732..520c249f19d3 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -318,56 +318,23 @@ struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
return pe;
 }
 
-/**
- * eeh_pe_get_parent - Retrieve the parent PE
- * @edev: EEH device
- *
- * The whole PEs existing in the system are organized as hierarchy
- * tree. The function is used to retrieve the parent PE according
- * to the parent EEH device.
- */
-static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
-{
-   struct eeh_dev *parent;
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
-
-   /*
-* It might have the case for the indirect parent
-* EEH device already having associated PE, but
-* the direct parent EEH device doesn't have yet.
-*/
-   if (edev->physfn)
-   pdn = pci_get_pdn(edev->physfn);
-   else
-   pdn = pdn ? pdn->parent : NULL;
-   while (pdn) {
-   /* We're poking out of PCI territory */
-   parent = pdn_to_eeh_dev(pdn);
-   if (!parent)
-   return NULL;
-
-   if (parent->pe)
-   return parent->pe;
-
-   pdn = pdn->parent;
-   }
-
-   return NULL;
-}
-
 /**
  * eeh_add_to_parent_pe - Add EEH device to parent PE
+ * @parent: PE to create additional PEs under
  * @edev: EEH device
  *
- * Add EEH device to the parent PE. If the parent PE already
- * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
- * we have to create new PE to hold the EEH device and the new
- * PE will be linked to its parent PE as well.
+ * Add EEH device to the PE in edev->pe_config_addr. If the PE
+ * already exists then we'll add it to that. Otherwise a new
+ * PE is created, and inserted into the PE tree below @parent.
+ * If @parent is NULL, then it will be inserted under the PHB
+ * PE for edev->controller.
+ *
+ * In either case @edev is added to the PE's device list.
  */
-int eeh_add_to_parent_pe(struct eeh_dev *edev)
+int eeh_add_to_parent_pe(struct eeh_pe *parent, struct eeh_dev *edev)
 {
int config_addr = edev->bdfn;
-   struct eeh_pe *pe, *parent;
+   struct eeh_pe *pe;
 
/* Check if the PE number is valid */
if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
@@ -431,7 +398,6 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 * to PHB directly. Otherwise, we have to associate the
 * PE with its parent.
 */
-   parent = eeh_pe_get_parent(edev);
if (!parent) {
parent = eeh_phb_pe_get(edev->controller);
if (!parent) {
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 7aba18e08996..49a932ff092a 100644
--- a/arch/powerp

[Very RFC 24/46] powernv/pci: Make the pre-cfg EEH freeze check use eeh_dev rather than pci_dn

2019-11-19 Thread Oliver O'Halloran
Squash another usage in preperation for making the config accessors pci_dn.

Signed-off-by: Oliver O'Halloran 
---
We might want to move this into eeh-powernv.c
---
 arch/powerpc/platforms/powernv/pci.c | 37 +---
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index d36dde9777aa..6170677bfdc7 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -708,30 +708,23 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
 }
 
 #if CONFIG_EEH
-static bool pnv_pci_cfg_check(struct pci_dn *pdn)
+bool pnv_eeh_pre_cfg_check(struct eeh_dev *edev)
 {
-   struct eeh_dev *edev = NULL;
-   struct pnv_phb *phb = pdn->phb->private_data;
-
-   /* EEH not enabled ? */
-   if (!(phb->flags & PNV_PHB_FLAG_EEH))
+   if (!edev || !edev->pe)
return true;
 
-   /* PE reset or device removed ? */
-   edev = pdn->edev;
-   if (edev) {
-   if (edev->pe &&
-   (edev->pe->state & EEH_PE_CFG_BLOCKED))
-   return false;
+   /* PE in reset? */
+   if (edev->pe->state & EEH_PE_CFG_BLOCKED)
+   return false;
 
-   if (edev->mode & EEH_DEV_REMOVED)
-   return false;
-   }
+   /* Device removed? */
+   if (edev->mode & EEH_DEV_REMOVED)
+   return false;
 
return true;
 }
 #else
-static inline pnv_pci_cfg_check(struct pci_dn *pdn)
+static inline pnv_pci_cfg_check(struct eeh_dev *edev)
 {
return true;
 }
@@ -743,6 +736,7 @@ static int pnv_pci_read_config(struct pci_bus *bus,
 {
struct pci_dn *pdn;
struct pnv_phb *phb;
+   struct eeh_dev *edev;
int ret;
 
*val = 0x;
@@ -750,14 +744,15 @@ static int pnv_pci_read_config(struct pci_bus *bus,
if (!pdn)
return PCIBIOS_DEVICE_NOT_FOUND;
 
-   if (!pnv_pci_cfg_check(pdn))
+   edev = pdn_to_eeh_dev(pdn);
+   if (!pnv_eeh_pre_cfg_check(edev))
return PCIBIOS_DEVICE_NOT_FOUND;
 
ret = pnv_pci_cfg_read(pdn, where, size, val);
phb = pdn->phb->private_data;
-   if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) {
+   if (phb->flags & PNV_PHB_FLAG_EEH && edev) {
if (*val == EEH_IO_ERROR_VALUE(size) &&
-   eeh_dev_check_failure(pdn->edev))
+   eeh_dev_check_failure(edev))
 return PCIBIOS_DEVICE_NOT_FOUND;
} else {
pnv_pci_config_check_eeh(pdn);
@@ -772,13 +767,15 @@ static int pnv_pci_write_config(struct pci_bus *bus,
 {
struct pci_dn *pdn;
struct pnv_phb *phb;
+   struct eeh_dev *edev;
int ret;
 
pdn = pci_get_pdn_by_devfn(bus, devfn);
if (!pdn)
return PCIBIOS_DEVICE_NOT_FOUND;
 
-   if (!pnv_pci_cfg_check(pdn))
+   edev = pdn_to_eeh_dev(pdn);
+   if (!pnv_eeh_pre_cfg_check(edev))
return PCIBIOS_DEVICE_NOT_FOUND;
 
ret = pnv_pci_cfg_write(pdn, where, size, val);
-- 
2.21.0



[Very RFC 25/46] powernv/pci: Remove pdn from pnv_pci_config_check_eeh()

2019-11-19 Thread Oliver O'Halloran
Despite the name this function is generic PowerNV PCI code rather than anything
EEH specific. Convert to take a phb and bdfn rather than a pci_dn.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci.c | 32 ++--
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 6170677bfdc7..50142ff045ac 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -591,9 +591,15 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, 
u32 pe_no)
spin_unlock_irqrestore(&phb->lock, flags);
 }
 
-static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
+/*
+ * This, very strangely named, function checks if a config access
+ * caused an EEH and un-freezes the PE if it did. This is mainly
+ * for the !CONFIG_EEH case where nothing is going to un-freeze
+ * it for us.
+ */
+static void pnv_pci_config_check_eeh(struct pnv_phb *phb, u16 bdfn)
 {
-   struct pnv_phb *phb = pdn->phb->private_data;
+   struct pnv_ioda_pe *ioda_pe;
u8  fstate = 0;
__be16  pcierr = 0;
unsigned int pe_no;
@@ -604,10 +610,11 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
 * setup that yet. So all ER errors should be mapped to
 * reserved PE.
 */
-   pe_no = pdn->pe_number;
-   if (pe_no == IODA_INVALID_PE) {
+   ioda_pe = __pnv_ioda_get_pe(phb, bdfn);
+   if (ioda_pe)
+   pe_no = ioda_pe->pe_number;
+   else
pe_no = phb->ioda.reserved_pe_idx;
-   }
 
/*
 * Fetch frozen state. If the PHB support compound PE,
@@ -629,7 +636,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
}
 
pr_devel(" -> EEH check, bdfn=%04x PE#%x fstate=%x\n",
-(pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
+bdfn, pe_no, fstate);
 
/* Clear the frozen state if applicable */
if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE ||
@@ -642,6 +649,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
if (phb->freeze_pe)
phb->freeze_pe(phb, pe_no);
 
+   /* fish out the EEH log and send an EEH event. */
pnv_pci_handle_eeh_config(phb, pe_no);
}
 }
@@ -735,7 +743,8 @@ static int pnv_pci_read_config(struct pci_bus *bus,
   int where, int size, u32 *val)
 {
struct pci_dn *pdn;
-   struct pnv_phb *phb;
+   struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
+   u16 bdfn = bus->number << 8 | devfn;
struct eeh_dev *edev;
int ret;
 
@@ -755,7 +764,7 @@ static int pnv_pci_read_config(struct pci_bus *bus,
eeh_dev_check_failure(edev))
 return PCIBIOS_DEVICE_NOT_FOUND;
} else {
-   pnv_pci_config_check_eeh(pdn);
+   pnv_pci_config_check_eeh(phb, bdfn);
}
 
return ret;
@@ -766,7 +775,8 @@ static int pnv_pci_write_config(struct pci_bus *bus,
int where, int size, u32 val)
 {
struct pci_dn *pdn;
-   struct pnv_phb *phb;
+   struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
+   u16 bdfn = bus->number << 8 | devfn;
struct eeh_dev *edev;
int ret;
 
@@ -779,9 +789,9 @@ static int pnv_pci_write_config(struct pci_bus *bus,
return PCIBIOS_DEVICE_NOT_FOUND;
 
ret = pnv_pci_cfg_write(pdn, where, size, val);
-   phb = pdn->phb->private_data;
+
if (!(phb->flags & PNV_PHB_FLAG_EEH))
-   pnv_pci_config_check_eeh(pdn);
+   pnv_pci_config_check_eeh(phb, bdfn);
 
return ret;
 }
-- 
2.21.0



[Very RFC 26/46] powernv/pci: Remove pdn from pnv_pci_cfg_{read|write}

2019-11-19 Thread Oliver O'Halloran
Remove the use of pci_dn from the low-level config space access functions.
These are used by the eeh's config ops and the bus config ops that we
provide to the PCI core.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 14 +++
 arch/powerpc/platforms/powernv/pci.c | 26 
 arch/powerpc/platforms/powernv/pci.h |  6 ++---
 3 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 49a932ff092a..8a73bc7517c5 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -331,31 +331,25 @@ static inline bool pnv_eeh_cfg_blocked(struct eeh_dev 
*edev)
 static int pnv_eeh_read_config(struct eeh_dev *edev,
   int where, int size, u32 *val)
 {
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
-
-   if (!pdn)
-   return PCIBIOS_DEVICE_NOT_FOUND;
+   struct pnv_phb *phb = edev->controller->private_data;
 
if (pnv_eeh_cfg_blocked(edev)) {
*val = 0x;
return PCIBIOS_SET_FAILED;
}
 
-   return pnv_pci_cfg_read(pdn, where, size, val);
+   return pnv_pci_cfg_read(phb, edev->bdfn, where, size, val);
 }
 
 static int pnv_eeh_write_config(struct eeh_dev *edev,
int where, int size, u32 val)
 {
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
-
-   if (!pdn)
-   return PCIBIOS_DEVICE_NOT_FOUND;
+   struct pnv_phb *phb = edev->controller->private_data;
 
if (pnv_eeh_cfg_blocked(edev))
return PCIBIOS_SET_FAILED;
 
-   return pnv_pci_cfg_write(pdn, where, size, val);
+   return pnv_pci_cfg_write(phb, edev->bdfn, where, size, val);
 }
 
 static struct eeh_pe *pnv_eeh_pe_get_parent(struct pci_dev *pdev)
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 50142ff045ac..36eea4bb514c 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -654,11 +654,9 @@ static void pnv_pci_config_check_eeh(struct pnv_phb *phb, 
u16 bdfn)
}
 }
 
-int pnv_pci_cfg_read(struct pci_dn *pdn,
+int pnv_pci_cfg_read(struct pnv_phb *phb, u16 bdfn,
 int where, int size, u32 *val)
 {
-   struct pnv_phb *phb = pdn->phb->private_data;
-   u32 bdfn = (pdn->busno << 8) | pdn->devfn;
s64 rc;
 
switch (size) {
@@ -685,19 +683,16 @@ int pnv_pci_cfg_read(struct pci_dn *pdn,
return PCIBIOS_FUNC_NOT_SUPPORTED;
}
 
-   pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
-__func__, pdn->busno, pdn->devfn, where, size, *val);
+   pr_devel("%s: bdfn: %x  +%x/%x -> %08x\n",
+__func__, bdfn, where, size, *val);
return PCIBIOS_SUCCESSFUL;
 }
 
-int pnv_pci_cfg_write(struct pci_dn *pdn,
+int pnv_pci_cfg_write(struct pnv_phb *phb, u16 bdfn,
  int where, int size, u32 val)
 {
-   struct pnv_phb *phb = pdn->phb->private_data;
-   u32 bdfn = (pdn->busno << 8) | pdn->devfn;
-
-   pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
-__func__, pdn->busno, pdn->devfn, where, size, val);
+   pr_devel("%s: bdfn: %x +%x/%x -> %08x\n",
+__func__, bdfn, where, size, val);
switch (size) {
case 1:
opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
@@ -753,12 +748,11 @@ static int pnv_pci_read_config(struct pci_bus *bus,
if (!pdn)
return PCIBIOS_DEVICE_NOT_FOUND;
 
-   edev = pdn_to_eeh_dev(pdn);
+   edev = pnv_eeh_find_edev(phb, bdfn);
if (!pnv_eeh_pre_cfg_check(edev))
return PCIBIOS_DEVICE_NOT_FOUND;
 
-   ret = pnv_pci_cfg_read(pdn, where, size, val);
-   phb = pdn->phb->private_data;
+   ret = pnv_pci_cfg_read(phb, bdfn, where, size, val);
if (phb->flags & PNV_PHB_FLAG_EEH && edev) {
if (*val == EEH_IO_ERROR_VALUE(size) &&
eeh_dev_check_failure(edev))
@@ -784,11 +778,11 @@ static int pnv_pci_write_config(struct pci_bus *bus,
if (!pdn)
return PCIBIOS_DEVICE_NOT_FOUND;
 
-   edev = pdn_to_eeh_dev(pdn);
+   edev = pnv_eeh_find_edev(phb, bdfn);
if (!pnv_eeh_pre_cfg_check(edev))
return PCIBIOS_DEVICE_NOT_FOUND;
 
-   ret = pnv_pci_cfg_write(pdn, where, size, val);
+   ret = pnv_pci_cfg_write(phb, bdfn, where, size, val);
 
if (!(phb->flags & PNV_PHB_FLAG_EEH))
pnv_pci_config_check_eeh(phb, bdfn);
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index be435a8

[Very RFC 27/46] powernv/pci: Clear reserved PE freezes

2019-11-19 Thread Oliver O'Halloran
When we scan an empty slot the PHB gets an Unsupported Request from the
downstream bridge when there's no device present at that BDFN.  Some older
PHBs (p7-IOC) don't allow further config space accesses while the PE is
frozen, so clear it here without bothering with the diagnostic log.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 36eea4bb514c..5b1f4677cdce 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -642,6 +642,19 @@ static void pnv_pci_config_check_eeh(struct pnv_phb *phb, 
u16 bdfn)
if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE ||
fstate == OPAL_EEH_STOPPED_DMA_FREEZE  ||
fstate == OPAL_EEH_STOPPED_MMIO_DMA_FREEZE) {
+
+   /*
+* Scanning an empty slot will result in a freeze on the 
reserved PE.
+*
+* Some old and bad PHBs block config space access to frozen 
PEs in
+* addition to MMIOs, so unfreeze it here.
+*/
+   if (pe_no == phb->ioda.reserved_pe_idx) {
+   phb->unfreeze_pe(phb, phb->ioda.reserved_pe_idx,
+OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+   return;
+   }
+
/*
 * If PHB supports compound PE, freeze it for
 * consistency.
-- 
2.21.0



[Very RFC 28/46] powernv/iov: Move SR-IOV PF state out of pci_dn

2019-11-19 Thread Oliver O'Halloran
Move the SR-IOV into a platform specific structure. I'm sure stashing all the
SR-IOV state in pci_dn seemed like a good idea at the time, but it results in a
lot of powernv specifics being leaked out of the platform directory.

Moving all the PHB3/4 specific M64 BAR wrangling into a PowerNV specific
structure helps to clarify the role of pci_dn and ensures that the platform
specifics stay that way.

This will make the code easier to understand and modify since we don't need
to so much aboute PowerNV changes breaking pseries and EEH, and vis-a-vis.

Signed-off-by: Oliver O'Halloran 
---
TODO: Remove all the sriov stuff from pci_dn. We can't do that yet because
the pseries SRIOV support was a giant hack that re-used some of the
previously powernv specific fields.
---
 arch/powerpc/include/asm/device.h |   3 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 199 --
 arch/powerpc/platforms/powernv/pci.h  |  36 
 3 files changed, 148 insertions(+), 90 deletions(-)

diff --git a/arch/powerpc/include/asm/device.h 
b/arch/powerpc/include/asm/device.h
index 266542769e4b..4d8934db7ef5 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -49,6 +49,9 @@ struct dev_archdata {
 #ifdef CONFIG_CXL_BASE
struct cxl_context  *cxl_ctx;
 #endif
+#ifdef CONFIG_PCI_IOV
+   void *iov_data;
+#endif
 };
 
 struct pdev_archdata {
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index a1c9315f3208..1c90feed233d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -966,14 +966,15 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, 
struct pnv_ioda_pe *pe)
 #ifdef CONFIG_PCI_IOV
 static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 {
-   struct pci_dn *pdn = pci_get_pdn(dev);
-   int i;
struct resource *res, res2;
+   struct pnv_iov_data *iov;
resource_size_t size;
u16 num_vfs;
+   int i;
 
if (!dev->is_physfn)
return -EINVAL;
+   iov = pnv_iov_get(dev);
 
/*
 * "offset" is in VFs.  The M64 windows are sized so that when they
@@ -983,7 +984,7 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, 
int offset)
 * separate PE, and changing the IOV BAR start address changes the
 * range of PEs the VFs are in.
 */
-   num_vfs = pdn->num_vfs;
+   num_vfs = iov->num_vfs;
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
res = &dev->resource[i + PCI_IOV_RESOURCES];
if (!res->flags || !res->parent)
@@ -1029,19 +1030,19 @@ static int pnv_pci_vf_resource_shift(struct pci_dev 
*dev, int offset)
 num_vfs, offset);
 
if (offset < 0) {
-   devm_release_resource(&dev->dev, &pdn->holes[i]);
-   memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
+   devm_release_resource(&dev->dev, &iov->holes[i]);
+   memset(&iov->holes[i], 0, sizeof(iov->holes[i]));
}
 
pci_update_resource(dev, i + PCI_IOV_RESOURCES);
 
if (offset > 0) {
-   pdn->holes[i].start = res2.start;
-   pdn->holes[i].end = res2.start + size * offset - 1;
-   pdn->holes[i].flags = IORESOURCE_BUS;
-   pdn->holes[i].name = "pnv_iov_reserved";
+   iov->holes[i].start = res2.start;
+   iov->holes[i].end = res2.start + size * offset - 1;
+   iov->holes[i].flags = IORESOURCE_BUS;
+   iov->holes[i].name = "pnv_iov_reserved";
devm_request_resource(&dev->dev, res->parent,
-   &pdn->holes[i]);
+   &iov->holes[i]);
}
}
return 0;
@@ -1273,37 +1274,37 @@ static void pnv_pci_ioda_setup_PEs(void)
 #ifdef CONFIG_PCI_IOV
 static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
 {
+   struct pnv_iov_data   *iov;
struct pnv_phb*phb;
-   struct pci_dn *pdn;
inti, j;
intm64_bars;
 
phb = pci_bus_to_pnvhb(pdev->bus);
-   pdn = pci_get_pdn(pdev);
+   iov = pnv_iov_get(pdev);
 
-   if (pdn->m64_single_mode)
+   if (iov->m64_single_mode)
m64_bars = num_vfs;
else
m64_bars = 1;
 
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
for (j = 0; j < m64_bars; j++) {
-   if (pdn->m64_map[j][

[Very RFC 29/46] powernv/pci: Remove open-coded PE lookup in PELT-V setup

2019-11-19 Thread Oliver O'Halloran
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 32 +--
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 1c90feed233d..5bd7c1b058da 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -760,6 +760,11 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
}
}
 
+   /*
+* Walk the bridges up to the root. Along the way mark this PE as
+* downstream of the bridge PE(s) so that errors upstream errors
+* also cause this PE to be frozen.
+*/
if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
pdev = pe->pbus->self;
else if (pe->flags & PNV_IODA_PE_DEV)
@@ -768,16 +773,27 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
else if (pe->flags & PNV_IODA_PE_VF)
pdev = pe->parent_dev;
 #endif /* CONFIG_PCI_IOV */
+
while (pdev) {
-   struct pci_dn *pdn = pci_get_pdn(pdev);
-   struct pnv_ioda_pe *parent;
+   struct pnv_ioda_pe *parent = pnv_ioda_get_pe(pdev);
 
-   if (pdn && pdn->pe_number != IODA_INVALID_PE) {
-   parent = &phb->ioda.pe_array[pdn->pe_number];
-   ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
-   if (ret)
-   return ret;
-   }
+   /*
+* FIXME: This is called from pcibios_setup_bridge(), which is 
called
+* from the bottom (leaf) bridge to the root. This means that 
this
+* doesn't actually setup the PELT-V entries since the PEs for
+* the bridges above assigned after this is run for the leaf.
+*
+* FIXMEFIXME: might not be true since moving PE configuration
+* into pcibios_bus_add_device().
+*/
+   if (!parent)
+   break;
+
+   WARN_ON(!parent || parent->pe_number == IODA_INVALID_PE);
+
+   ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
+   if (ret)
+   return ret;
 
pdev = pdev->bus->self;
}
-- 
2.21.0



[Very RFC 30/46] powernv/pci: Remove open-coded PE lookup in PELT-V teardown

2019-11-19 Thread Oliver O'Halloran
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5bd7c1b058da..d4b5ee926222 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -853,11 +853,13 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, 
struct pnv_ioda_pe *pe)
 
/* Release from all parents PELT-V */
while (parent) {
-   struct pci_dn *pdn = pci_get_pdn(parent);
-   if (pdn && pdn->pe_number != IODA_INVALID_PE) {
-   rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
-   pe->pe_number, 
OPAL_REMOVE_PE_FROM_DOMAIN);
-   /* XXX What to do in case of error ? */
+   struct pnv_ioda_pe *parent_pe = pnv_ioda_get_pe(parent);
+
+   if (parent_pe) {
+   rc = opal_pci_set_peltv(phb->opal_id,
+   parent_pe->pe_number,
+   pe->pe_number,
+   OPAL_REMOVE_PE_FROM_DOMAIN);
}
parent = parent->bus->self;
}
-- 
2.21.0



[Very RFC 31/46] powernv/pci: Remove open-coded PE lookup in pnv_pci_ioda_dma_dev_setup()

2019-11-19 Thread Oliver O'Halloran
Use the helper to look up the pnv_ioda_pe for the device we're configuring DMA
for. In the VF case there's no need set pdn->pe_number since nothing looks at
it any more.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index d4b5ee926222..98d858999a2d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1709,10 +1709,9 @@ int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 
num_vfs)
 
 static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev 
*pdev)
 {
-   struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
 
-   pe = &phb->ioda.pe_array[pdn->pe_number];
+   pe = pnv_ioda_get_pe(pdev);
WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
-- 
2.21.0



[Very RFC 32/46] powernv/pci: Remove open-coded PE lookup in iommu_bypass_supported()

2019-11-19 Thread Oliver O'Halloran
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 98d858999a2d..7e88de18ead6 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1801,13 +1801,11 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
u64 dma_mask)
 {
struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
-   struct pci_dn *pdn = pci_get_pdn(pdev);
-   struct pnv_ioda_pe *pe;
+   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
 
-   if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+   if (WARN_ON(!pe))
return false;
 
-   pe = &phb->ioda.pe_array[pdn->pe_number];
if (pe->tce_bypass_enabled) {
u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
if (dma_mask >= top)
-- 
2.21.0



[Very RFC 33/46] powernv/pci: Remove open-coded PE lookup in iommu notifier

2019-11-19 Thread Oliver O'Halloran
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 5b1f4677cdce..0eeea8652426 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -943,23 +943,22 @@ static int pnv_tce_iommu_bus_notifier(struct 
notifier_block *nb,
 {
struct device *dev = data;
struct pci_dev *pdev;
-   struct pci_dn *pdn;
struct pnv_ioda_pe *pe;
struct pnv_phb *phb;
 
switch (action) {
case BUS_NOTIFY_ADD_DEVICE:
pdev = to_pci_dev(dev);
-   pdn = pci_get_pdn(pdev);
phb = pci_bus_to_pnvhb(pdev->bus);
 
WARN_ON_ONCE(!phb);
-   if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb)
+   if (!phb)
return 0;
 
-   pe = &phb->ioda.pe_array[pdn->pe_number];
-   if (!pe->table_group.group)
+   pe = pnv_ioda_get_pe(pdev);
+   if (!pe || !pe->table_group.group)
return 0;
+
iommu_add_device(&pe->table_group, dev);
return 0;
case BUS_NOTIFY_DEL_DEVICE:
-- 
2.21.0



[Very RFC 34/46] powernv/pci: Remove open-coded PE lookup in pnv_pci_enable_device_hook()

2019-11-19 Thread Oliver O'Halloran
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7e88de18ead6..4f38652c7cd7 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3382,7 +3382,6 @@ static resource_size_t 
pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
 static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
 {
struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
-   struct pci_dn *pdn;
 
/* The function is probably called while the PEs have
 * not be created yet. For example, resource reassignment
@@ -3392,11 +3391,7 @@ static bool pnv_pci_enable_device_hook(struct pci_dev 
*dev)
if (!phb->initialized)
return true;
 
-   pdn = pci_get_pdn(dev);
-   if (!pdn || pdn->pe_number == IODA_INVALID_PE)
-   return false;
-
-   return true;
+   return !!pnv_ioda_get_pe(dev);
 }
 
 static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
-- 
2.21.0



[Very RFC 35/46] powernv/pci: Remove open-coded PE lookup in pnv_pci_release_device

2019-11-19 Thread Oliver O'Halloran
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 4f38652c7cd7..8525642b1256 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3562,14 +3562,14 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
 static void pnv_pci_release_device(struct pci_dev *pdev)
 {
struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
struct pci_dn *pdn = pci_get_pdn(pdev);
-   struct pnv_ioda_pe *pe;
 
/* The VF PE state is torn down when sriov_disable() is called */
if (pdev->is_virtfn)
return;
 
-   if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+   if (WARN_ON(!pe))
return;
 
/*
@@ -3588,7 +3588,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
 * be increased on adding devices. It leads to unbalanced PE's device
 * count and eventually make normal PCI hotplug path broken.
 */
-   pe = &phb->ioda.pe_array[pdn->pe_number];
pdn->pe_number = IODA_INVALID_PE;
 
WARN_ON(--pe->device_count < 0);
-- 
2.21.0



[Very RFC 36/46] powernv/npu: Remove open-coded PE lookup for GPU device

2019-11-19 Thread Oliver O'Halloran
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 13 ++---
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index b95b9e3c4c98..68bfaef44862 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -97,25 +97,16 @@ EXPORT_SYMBOL(pnv_pci_get_npu_dev);
 static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
  struct pci_dev **gpdev)
 {
-   struct pnv_phb *phb;
-   struct pci_controller *hose;
struct pci_dev *pdev;
struct pnv_ioda_pe *pe;
-   struct pci_dn *pdn;
 
pdev = pnv_pci_get_gpu_dev(npe->pdev);
if (!pdev)
return NULL;
 
-   pdn = pci_get_pdn(pdev);
-   if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-   return NULL;
-
-   hose = pci_bus_to_host(pdev->bus);
-   phb = hose->private_data;
-   pe = &phb->ioda.pe_array[pdn->pe_number];
+   pe = pnv_ioda_get_pe(pdev);
 
-   if (gpdev)
+   if (pe && pdev)
*gpdev = pdev;
 
return pe;
-- 
2.21.0



[Very RFC 37/46] powernv/pci: Use the PHB's rmap for pnv_ioda_to_pe()

2019-11-19 Thread Oliver O'Halloran
Rather than using the pdn->pe_number for a device as an index into the
IODA PE array we can use the reverse map. This maps the RID (i.e. bdfn)
to the PE number associated with it. Firmware maintains a copy of the
rmap which is used by the hardware for determining which PE to use
when handling a DMA so this gets us a bit closer to the model used
by the HW, which is comprensible by mortals, rather than... whatever
the hell is going on currently.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8525642b1256..d111a50fbe68 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -672,13 +672,9 @@ struct pnv_ioda_pe *__pnv_ioda_get_pe(struct pnv_phb *phb, 
u16 bdfn)
 struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 {
struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
-   struct pci_dn *pdn = pci_get_pdn(dev);
+   u16 bdfn = (dev->bus->number << 8) | dev->devfn;
 
-   if (!pdn)
-   return NULL;
-   if (pdn->pe_number == IODA_INVALID_PE)
-   return NULL;
-   return &phb->ioda.pe_array[pdn->pe_number];
+   return __pnv_ioda_get_pe(phb, bdfn);
 }
 
 static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
-- 
2.21.0



[Very RFC 38/46] powerpc/pci-hotplug: Scan the whole bus when using PCI_PROBE_NORMAL

2019-11-19 Thread Oliver O'Halloran
Currently when using the normal (i.e not building pci_dev's from the DT
node) probe method we only scan the devfn corresponding to the first child
of the bridge's DT node. This doesn't make much sense to me, but it seems
to have worked so far. At a guess it seems to work because in a PCIe
environment the first downstream child will be at devfn 00.0.

In any case it's completely broken when no pci_dn is available. Remove
the PCI_DN checking and scan each of the device number that might be on
the downstream bus.

Cc: Benjamin Herrenschmidt 
Signed-off-by: Oliver O'Halloran 
---
I'm not sure we should be using pci_scan_slot() directly here. Maybe
there's some insane legacy reason for it.
---
 arch/powerpc/kernel/pci-hotplug.c | 15 ---
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/pci-hotplug.c 
b/arch/powerpc/kernel/pci-hotplug.c
index d6a67f814983..85299c769768 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -123,17 +123,10 @@ void pci_hp_add_devices(struct pci_bus *bus)
if (mode == PCI_PROBE_DEVTREE) {
/* use ofdt-based probe */
of_rescan_bus(dn, bus);
-   } else if (mode == PCI_PROBE_NORMAL &&
-  dn->child && PCI_DN(dn->child)) {
-   /*
-* Use legacy probe. In the partial hotplug case, we
-* probably have grandchildren devices unplugged. So
-* we don't check the return value from pci_scan_slot() in
-* order for fully rescan all the way down to pick them up.
-* They can have been removed during partial hotplug.
-*/
-   slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
-   pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
+   } else if (mode == PCI_PROBE_NORMAL) {
+   for (slotno = 0; slotno < 255; slotno += 8)
+   pci_scan_slot(bus, slotno);
+
max = bus->busn_res.start;
/*
 * Scan bridges that are already configured. We don't touch
-- 
2.21.0



[Very RFC 39/46] powernv/npu: Avoid pci_dn when mapping device_node to a pci_dev

2019-11-19 Thread Oliver O'Halloran
There's no need to use the pci_dn to find a device_node from a pci_dev.
Just search for the node pointed to by the pci_dev's of_node pointer.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 68bfaef44862..72d3749da02c 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -21,11 +21,11 @@
 
 static struct pci_dev *get_pci_dev(struct device_node *dn)
 {
-   struct pci_dn *pdn = PCI_DN(dn);
-   struct pci_dev *pdev;
+   struct pci_dev *pdev = NULL;
 
-   pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
-  pdn->busno, pdn->devfn);
+   for_each_pci_dev(pdev)
+   if (pdev->dev.of_node == dn)
+   break;
 
/*
 * pci_get_domain_bus_and_slot() increased the reference count of
-- 
2.21.0



[Very RFC 40/46] powernv/npu: Don't drop refcount when looking up GPU pci_devs

2019-11-19 Thread Oliver O'Halloran
The comment here implies that we don't need to take a ref to the pci_dev
because the ioda_pe will always have one. This implies that the current
expection is that the pci_dev for an NPU device will *never* be torn
down since the ioda_pe having a ref to the device will prevent the
release function from being called.

In other words, the desired behaviour here appears to be leaking a ref.

Nice!

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 72d3749da02c..2eb6e6d45a98 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -28,15 +28,10 @@ static struct pci_dev *get_pci_dev(struct device_node *dn)
break;
 
/*
-* pci_get_domain_bus_and_slot() increased the reference count of
-* the PCI device, but callers don't need that actually as the PE
-* already holds a reference to the device. Since callers aren't
-* aware of the reference count change, call pci_dev_put() now to
-* avoid leaks.
+* NB: for_each_pci_dev() elevates the pci_dev refcount.
+* Caller is responsible for dropping the ref when it's
+* finished with it.
 */
-   if (pdev)
-   pci_dev_put(pdev);
-
return pdev;
 }
 
-- 
2.21.0



[Very RFC 41/46] powernv/eeh: Remove pdn setup for SR-IOV VFs

2019-11-19 Thread Oliver O'Halloran
We don't need a pci_dn for the VF any more, so we can skip adding them.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 16 
 1 file changed, 16 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index d111a50fbe68..d3e375d71cdc 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1526,7 +1526,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, 
u16 num_vfs)
for (vf_index = 0; vf_index < num_vfs; vf_index++) {
int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
-   struct pci_dn *vf_pdn;
 
if (iov->m64_single_mode)
pe_num = iov->pe_num_map[vf_index];
@@ -1558,15 +1557,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, 
u16 num_vfs)
list_add_tail(&pe->list, &phb->ioda.pe_list);
mutex_unlock(&phb->ioda.pe_list_mutex);
 
-   /* associate this pe to it's pdn */
-   list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
-   if (vf_pdn->busno == vf_bus &&
-   vf_pdn->devfn == vf_devfn) {
-   vf_pdn->pe_number = pe_num;
-   break;
-   }
-   }
-
pnv_pci_ioda2_setup_dma_pe(phb, pe);
 #ifdef CONFIG_IOMMU_API
iommu_register_group(&pe->table_group,
@@ -1688,17 +1678,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 
num_vfs)
 int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
 {
pnv_pci_sriov_disable(pdev);
-
-   /* Release PCI data */
-   remove_sriov_vf_pdns(pdev);
return 0;
 }
 
 int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 {
-   /* Allocate PCI data */
-   add_sriov_vf_pdns(pdev);
-
return pnv_pci_sriov_enable(pdev, num_vfs);
 }
 #endif /* CONFIG_PCI_IOV */
-- 
2.21.0



[Very RFC 42/46] powernv/pci: Don't clear pdn->pe_number in pnv_pci_release_device

2019-11-19 Thread Oliver O'Halloran
Nothing looks at it anymore.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index d3e375d71cdc..45d940730c30 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3541,9 +3541,7 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
 
 static void pnv_pci_release_device(struct pci_dev *pdev)
 {
-   struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
-   struct pci_dn *pdn = pci_get_pdn(pdev);
 
/* The VF PE state is torn down when sriov_disable() is called */
if (pdev->is_virtfn)
@@ -3560,16 +3558,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
if (pdev->is_physfn)
kfree(pdev->dev.archdata.iov_data);
 
-   /*
-* PCI hotplug can happen as part of EEH error recovery. The @pdn
-* isn't removed and added afterwards in this scenario. We should
-* set the PE number in @pdn to an invalid one. Otherwise, the PE's
-* device count is decreased on removing devices while failing to
-* be increased on adding devices. It leads to unbalanced PE's device
-* count and eventually make normal PCI hotplug path broken.
-*/
-   pdn->pe_number = IODA_INVALID_PE;
-
WARN_ON(--pe->device_count < 0);
if (pe->device_count == 0)
pnv_ioda_release_pe(pe);
-- 
2.21.0



[Very RFC 43/46] powernv/pci: Do not set pdn->pe_number for NPU/CAPI devices

2019-11-19 Thread Oliver O'Halloran
The only thing we need the pdn for in this function is setting the pe_number
field, which we don't use anymore. Fix the weird refcounting behaviour while
we're here.

Signed-off-by: Oliver O'Halloran 
---
Either Fred, or Reza also fixed this in some patch lately and that'll probably 
get
merged before this one does.
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 27 +--
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 45d940730c30..2a9201306543 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1066,16 +1066,13 @@ static int pnv_pci_vf_resource_shift(struct pci_dev 
*dev, int offset)
 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 {
struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
-   struct pci_dn *pdn = pci_get_pdn(dev);
-   struct pnv_ioda_pe *pe;
+   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
 
-   if (!pdn) {
-   pr_err("%s: Device tree node not associated properly\n",
-  pci_name(dev));
+   /* Already has a PE assigned? huh? */
+   if (pe) {
+   WARN_ON(1);
return NULL;
}
-   if (pdn->pe_number != IODA_INVALID_PE)
-   return NULL;
 
pe = pnv_ioda_alloc_pe(phb);
if (!pe) {
@@ -1084,29 +1081,25 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct 
pci_dev *dev)
return NULL;
}
 
-   /* NOTE: We get only one ref to the pci_dev for the pdn, not for the
-* pointer in the PE data structure, both should be destroyed at the
-* same time. However, this needs to be looked at more closely again
-* once we actually start removing things (Hotplug, SR-IOV, ...)
+   /*
+* NB: We **do not** hold a pci_dev ref for pe->pdev.
 *
-* At some point we want to remove the PDN completely anyways
+* The pci_dev's release function cleans up the ioda_pe state, so:
+*  a) We can't take a ref otherwise the release function is never 
called
+*  b) The pe->pdev pointer will always point to valid pci_dev (or NULL)
 */
-   pci_dev_get(dev);
-   pdn->pe_number = pe->pe_number;
pe->flags = PNV_IODA_PE_DEV;
pe->pdev = dev;
pe->pbus = NULL;
pe->mve_number = -1;
-   pe->rid = dev->bus->number << 8 | pdn->devfn;
+   pe->rid = dev->bus->number << 8 | dev->devfn;
 
pe_info(pe, "Associated device to PE\n");
 
if (pnv_ioda_configure_pe(phb, pe)) {
/* XXX What do we do here ? */
pnv_ioda_free_pe(pe);
-   pdn->pe_number = IODA_INVALID_PE;
pe->pdev = NULL;
-   pci_dev_put(dev);
return NULL;
}
 
-- 
2.21.0



  1   2   3   4   5   6   7   >