[PATCH] powerpc/pseries: hwpoison the pages upon hitting UE

2019-04-15 Thread Ganesh Goudar
Add support to hwpoison the pages upon hitting machine check
exception.

This patch queues the address where UE is hit to percpu array
and schedules work to plumb it into memory poison infrastructure.

Reviewed-by: Mahesh Salgaonkar 
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/mce.h   |  1 +
 arch/powerpc/kernel/mce_power.c  |  2 +-
 arch/powerpc/platforms/pseries/ras.c | 85 +++-
 3 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 17996bc9382b..ad47fa865324 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -210,6 +210,7 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 6b800eec31f2..367fbfa2e835 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -36,7 +36,7 @@
  * Convert an address related to an mm to a PFN. NOTE: we are in real
  * mode, we could potentially race with page table updates.
  */
-static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
 {
pte_t *ptep;
unsigned long flags;
diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 452dcfd7e5dd..d689b5fa6354 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -41,6 +41,18 @@ static struct irq_work mce_errlog_process_work = {
.func = mce_process_errlog_event,
 };
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+static void pseries_hwpoison_work_fn(struct work_struct *work);
+static DEFINE_PER_CPU(int, rtas_ue_count);
+static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]);
+static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn);
+
+#define UE_EFFECTIVE_ADDR_PROVIDED PPC_BIT8(1)
+#define UE_LOGICAL_ADDR_PROVIDED   PPC_BIT8(2)
+
+#endif /* CONFIG_MEMORY_FAILURE */
+
 #define EPOW_SENSOR_TOKEN  9
 #define EPOW_SENSOR_INDEX  0
 
@@ -707,6 +719,75 @@ static int mce_handle_error(struct rtas_error_log *errp)
return disposition;
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+static void pseries_hwpoison_work_fn(struct work_struct *work)
+{
+   unsigned long paddr;
+   int index;
+
+   while (__this_cpu_read(rtas_ue_count) > 0) {
+   index = __this_cpu_read(rtas_ue_count) - 1;
+   paddr = __this_cpu_read(rtas_ue_paddr[index]);
+   memory_failure(paddr >> PAGE_SHIFT, 0);
+   __this_cpu_dec(rtas_ue_count);
+   }
+}
+
+static void queue_ue_paddr(unsigned long paddr)
+{
+   int index;
+
+   index = __this_cpu_inc_return(rtas_ue_count) - 1;
+   if (index >= MAX_MC_EVT) {
+   __this_cpu_dec(rtas_ue_count);
+   return;
+   }
+   this_cpu_write(rtas_ue_paddr[index], paddr);
+   schedule_work(&hwpoison_work);
+}
+
+static void pseries_do_memory_failure(struct pt_regs *regs,
+ struct pseries_mc_errorlog *mce_log)
+{
+   unsigned long paddr;
+
+   if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
+   paddr = be64_to_cpu(mce_log->logical_address);
+   } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
+   unsigned long pfn;
+
+   pfn = addr_to_pfn(regs,
+ be64_to_cpu(mce_log->effective_address));
+   if (pfn == ULONG_MAX)
+   return;
+   paddr = pfn << PAGE_SHIFT;
+   } else {
+   return;
+   }
+   queue_ue_paddr(paddr);
+}
+
+static void pseries_process_ue(struct pt_regs *regs,
+  struct rtas_error_log *errp)
+{
+   struct pseries_errorlog *pseries_log;
+   struct pseries_mc_errorlog *mce_log;
+
+   if (!rtas_error_extended(errp))
+   return;
+
+   pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+   if (!pseries_log)
+   return;
+
+   mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+
+   if (mce_log->error_type == MC_ERROR_TYPE_UE)
+   pseries_do_memory_failure(regs, mce_log);
+}
+#endif /*CONFIG_MEMORY_FAILURE */
+
 /*
  * Process MCE rtas errlog event.
  */
@@ -764,7 +845,9 @@ static int recover_mce(struct pt_regs *regs, struct 
rtas_error_log *err)
_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
recov

[PATCH] powerpc/pseries: Fix MCE handling on pseries

2020-03-13 Thread Ganesh Goudar
MCE handling on pSeries platform fails as recent rework to use common
code for pSeries and PowerNV in machine check error handling tries to
access per-cpu variables in realmode. The per-cpu variables may be
outside the RMO region on pSeries platform and needs translation to be
enabled for access. Just moving these per-cpu variables into the RMO region
didn't help because we queue some work to workqueues in real mode, which
again tries to touch per-cpu variables. Also fwnmi_release_errinfo()
cannot be called when translation is not enabled.

This patch fixes this by enabling translation in the exception handler
when all required real mode handling is done. This change only affects
the pSeries platform.

Without this fix below kernel crash is seen on injecting
SLB multihit:

BUG: Unable to handle kernel data access on read at 0xc0027b205950
Faulting instruction address: 0xc003b7e0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in: mcetest_slb(OE+) af_packet(E) xt_tcpudp(E) ip6t_rpfilter(E) 
ip6t_REJECT(E) ipt_REJECT(E) xt_conntrack(E) ip_set(E) nfnetlink(E) 
ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) 
ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) nf_conntrack(E) 
nf_defrag_ipv6(E) nf_defrag_ipv4(E) iptable_mangle(E) iptable_raw(E) 
iptable_security(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) 
ip6_tables(E) iptable_filter(E) ip_tables(E) x_tables(E) xfs(E) ibmveth(E) 
vmx_crypto(E) gf128mul(E) uio_pdrv_genirq(E) uio(E) crct10dif_vpmsum(E) 
rtc_generic(E) btrfs(E) libcrc32c(E) xor(E) zstd_decompress(E) zstd_compress(E) 
raid6_pq(E) sr_mod(E) sd_mod(E) cdrom(E) ibmvscsi(E) scsi_transport_srp(E) 
crc32c_vpmsum(E) dm_mod(E) sg(E) scsi_mod(E)
CPU: 34 PID: 8154 Comm: insmod Kdump: loaded Tainted: G OE 5.5.0-mahesh #1
NIP: c003b7e0 LR: c00f2218 CTR: 
REGS: c7dcb960 TRAP: 0300 Tainted: G OE (5.5.0-mahesh)
MSR: 80001003  CR: 28002428 XER: 2004
CFAR: c00f2214 DAR: c0027b205950 DSISR: 4000 IRQMASK: 0
GPR00: c00f2218 c7dcbbf0 c1544800 c7dcbd70
GPR04: 0001 c7dcbc98 c00800d00258 c008011c
GPR08:  00030003 c1035950 0348
GPR12: 00027a1d c7f9c000 0558 
GPR16: 0540 c0080111 c00801110540 
GPR20: c022af10 c0025480fd70 c0080128 c0004bfbb300
GPR24: c1442330 c008080d c0080800 4009287a77000510
GPR28:  0002 c1033d30 0001
NIP [c003b7e0] save_mce_event+0x30/0x240
LR [c00f2218] pseries_machine_check_realmode+0x2c8/0x4f0
Call Trace:
Instruction dump:
3c4c0151 38429050 7c0802a6 6000 fbc1fff0 fbe1fff8 f821ffd1 3d42ffaf
3fc2ffaf e98d0030 394a1150 3bdef530 <7d6a62aa> 1d2b0048 2f8b0063 380b0001
---[ end trace 46fd63f36bbdd940 ]---

Fixes: 9ca766f9891d ("powerpc/64s/pseries: machine check convert to use common 
event code")
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/exceptions-64s.S | 12 
 arch/powerpc/platforms/pseries/pseries.h |  1 +
 arch/powerpc/platforms/pseries/ras.c | 11 +++
 3 files changed, 24 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index ffc15f4f079d..93d0fddd8074 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -2405,3 +2405,15 @@ replay_interrupt_return:
blr
 
 _ASM_NOKPROBE_SYMBOL(__replay_interrupt)
+
+_GLOBAL(switch_to_virt_mode)
+   mflrr10
+   mfmsr   r11
+   ori r11,r11,MSR_IR|MSR_DR
+   /* Clear MSR_RI before setting SRR0 and SRR1 */
+   li  r12,0
+   mtmsrd  r12,1
+   mtspr   SPRN_SRR0,r10
+   mtspr   SPRN_SRR1,r11
+   RFI_TO_KERNEL
+   b   .
diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index 13fa370a87e4..768e3e0f137d 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -113,5 +113,6 @@ int dlpar_workqueue_init(void);
 
 void pseries_setup_rfi_flush(void);
 void pseries_lpar_read_hblkrm_characteristics(void);
+extern void switch_to_virt_mode(void);
 
 #endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 1d7f973c647b..5d49d9d711da 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -683,6 +683,17 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
 #endif
 
 out:
+   /*
+* Enable translation as we will be accessing per-cpu variables
+* in save_mce_event() which may fall outside RMO region, also
+* leave it enabled because

[PATCH] powerpc/pseries: Handle UE event for memcpy_mcsafe

2020-03-13 Thread Ganesh Goudar
If we hit UE at an instruction with a fixup entry, flag to
ignore the event and set nip to continue execution at the
fixup entry.
For powernv these changes are already made by commit
895e3dceeb97 ("powerpc/mce: Handle UE event for memcpy_mcsafe")

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/platforms/pseries/ras.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 5d49d9d711da..6dc3074a34c5 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -505,6 +506,7 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
int initiator = rtas_error_initiator(errp);
int severity = rtas_error_severity(errp);
u8 error_type, err_sub_type;
+   const struct exception_table_entry *entry;
 
if (initiator == RTAS_INITIATOR_UNKNOWN)
mce_err.initiator = MCE_INITIATOR_UNKNOWN;
@@ -558,6 +560,12 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
switch (mce_log->error_type) {
case MC_ERROR_TYPE_UE:
mce_err.error_type = MCE_ERROR_TYPE_UE;
+   entry = search_kernel_exception_table(regs->nip);
+   if (entry) {
+   mce_err.ignore_event = true;
+   regs->nip = extable_fixup(entry);
+   disposition = RTAS_DISP_FULLY_RECOVERED;
+   }
switch (err_sub_type) {
case MC_ERROR_UE_IFETCH:
mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
-- 
2.17.2



[PATCH v2] powerpc/pseries: Fix MCE handling on pseries

2020-03-20 Thread Ganesh Goudar
MCE handling on pSeries platform fails as recent rework to use common
code for pSeries and PowerNV in machine check error handling tries to
access per-cpu variables in realmode. The per-cpu variables may be
outside the RMO region on pSeries platform and needs translation to be
enabled for access. Just moving these per-cpu variables into the RMO region
didn't help because we queue some work to workqueues in real mode, which
again tries to touch per-cpu variables. Also fwnmi_release_errinfo()
cannot be called when translation is not enabled.

This patch fixes this by enabling translation in the exception handler
when all required real mode handling is done. This change only affects
the pSeries platform.

Without this fix below kernel crash is seen on injecting
SLB multihit:

BUG: Unable to handle kernel data access on read at 0xc0027b205950
Faulting instruction address: 0xc003b7e0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in: mcetest_slb(OE+) af_packet(E) xt_tcpudp(E) ip6t_rpfilter(E) 
ip6t_REJECT(E) ipt_REJECT(E) xt_conntrack(E) ip_set(E) nfnetlink(E) 
ebtable_nat(E) ebtable_broute(E) ip6table_nat(E) ip6table_mangle(E) 
ip6table_raw(E) ip6table_security(E) iptable_nat(E) nf_nat(E) nf_conntrack(E) 
nf_defrag_ipv6(E) nf_defrag_ipv4(E) iptable_mangle(E) iptable_raw(E) 
iptable_security(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) 
ip6_tables(E) iptable_filter(E) ip_tables(E) x_tables(E) xfs(E) ibmveth(E) 
vmx_crypto(E) gf128mul(E) uio_pdrv_genirq(E) uio(E) crct10dif_vpmsum(E) 
rtc_generic(E) btrfs(E) libcrc32c(E) xor(E) zstd_decompress(E) zstd_compress(E) 
raid6_pq(E) sr_mod(E) sd_mod(E) cdrom(E) ibmvscsi(E) scsi_transport_srp(E) 
crc32c_vpmsum(E) dm_mod(E) sg(E) scsi_mod(E)
CPU: 34 PID: 8154 Comm: insmod Kdump: loaded Tainted: G OE 5.5.0-mahesh #1
NIP: c003b7e0 LR: c00f2218 CTR: 
REGS: c7dcb960 TRAP: 0300 Tainted: G OE (5.5.0-mahesh)
MSR: 80001003  CR: 28002428 XER: 2004
CFAR: c00f2214 DAR: c0027b205950 DSISR: 4000 IRQMASK: 0
GPR00: c00f2218 c7dcbbf0 c1544800 c7dcbd70
GPR04: 0001 c7dcbc98 c00800d00258 c008011c
GPR08:  00030003 c1035950 0348
GPR12: 00027a1d c7f9c000 0558 
GPR16: 0540 c0080111 c00801110540 
GPR20: c022af10 c0025480fd70 c0080128 c0004bfbb300
GPR24: c1442330 c008080d c0080800 4009287a77000510
GPR28:  0002 c1033d30 0001
NIP [c003b7e0] save_mce_event+0x30/0x240
LR [c00f2218] pseries_machine_check_realmode+0x2c8/0x4f0
Call Trace:
Instruction dump:
3c4c0151 38429050 7c0802a6 6000 fbc1fff0 fbe1fff8 f821ffd1 3d42ffaf
3fc2ffaf e98d0030 394a1150 3bdef530 <7d6a62aa> 1d2b0048 2f8b0063 380b0001
---[ end trace 46fd63f36bbdd940 ]---

Fixes: 9ca766f9891d ("powerpc/64s/pseries: machine check convert to use common 
event code")
Reviewed-by: Mahesh Salgaonkar 
Reviewed-by: Nicholas Piggin 
Signed-off-by: Ganesh Goudar 
---
v2: Avoid asm code to switch to virtual mode.
---
 arch/powerpc/platforms/pseries/ras.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 1d7f973c647b..43710b69e09e 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -683,6 +683,17 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
 #endif
 
 out:
+   /*
+* Enable translation as we will be accessing per-cpu variables
+* in save_mce_event() which may fall outside RMO region, also
+* leave it enabled because subsequently we will be queuing work
+* to workqueues where again per-cpu variables accessed, besides
+* fwnmi_release_errinfo() crashes when called in realmode on
+* pseries.
+* Note: All the realmode handling like flushing SLB entries for
+*   SLB multihit is done by now.
+*/
+   mtmsr(mfmsr() | MSR_IR | MSR_DR);
save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
&mce_err, regs->nip, eaddr, paddr);
 
-- 
2.17.2



[ltcras] powerpc/pseries: Handle UE event for memcpy_mcsafe

2020-03-20 Thread Ganesh Goudar
If we hit UE at an instruction with a fixup entry, flag to
ignore the event and set nip to continue execution at the
fixup entry.
For powernv these changes are already made by
commit 895e3dceeb97 ("powerpc/mce: Handle UE event for memcpy_mcsafe")

Reviewed-by: Mahesh Salgaonkar 
Reviewed-by: Santosh S 
Signed-off-by: Ganesh Goudar 
---
V2: Fixes a trivial checkpatch error in commit msg
---
 arch/powerpc/platforms/pseries/ras.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 5d49d9d711da..6dc3074a34c5 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -505,6 +506,7 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
int initiator = rtas_error_initiator(errp);
int severity = rtas_error_severity(errp);
u8 error_type, err_sub_type;
+   const struct exception_table_entry *entry;
 
if (initiator == RTAS_INITIATOR_UNKNOWN)
mce_err.initiator = MCE_INITIATOR_UNKNOWN;
@@ -558,6 +560,12 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
switch (mce_log->error_type) {
case MC_ERROR_TYPE_UE:
mce_err.error_type = MCE_ERROR_TYPE_UE;
+   entry = search_kernel_exception_table(regs->nip);
+   if (entry) {
+   mce_err.ignore_event = true;
+   regs->nip = extable_fixup(entry);
+   disposition = RTAS_DISP_FULLY_RECOVERED;
+   }
switch (err_sub_type) {
case MC_ERROR_UE_IFETCH:
mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
-- 
2.17.2



[PATCH v3] powerpc/pseries: Handle UE event for memcpy_mcsafe

2020-03-22 Thread Ganesh Goudar
If we hit UE at an instruction with a fixup entry, flag to
ignore the event and set nip to continue execution at the
fixup entry.
For powernv these changes are already made by
commit 895e3dceeb97 ("powerpc/mce: Handle UE event for memcpy_mcsafe")

Reviewed-by: Mahesh Salgaonkar 
Reviewed-by: Santosh S 
Signed-off-by: Ganesh Goudar 
---
V2: Fixes a trivial checkpatch error in commit msg.
V3: Use proper subject prefix.
---
 arch/powerpc/platforms/pseries/ras.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 43710b69e09e..58e2483fbb1a 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -505,6 +506,7 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
int initiator = rtas_error_initiator(errp);
int severity = rtas_error_severity(errp);
u8 error_type, err_sub_type;
+   const struct exception_table_entry *entry;
 
if (initiator == RTAS_INITIATOR_UNKNOWN)
mce_err.initiator = MCE_INITIATOR_UNKNOWN;
@@ -558,6 +560,12 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
switch (mce_log->error_type) {
case MC_ERROR_TYPE_UE:
mce_err.error_type = MCE_ERROR_TYPE_UE;
+   entry = search_kernel_exception_table(regs->nip);
+   if (entry) {
+   mce_err.ignore_event = true;
+   regs->nip = extable_fixup(entry);
+   disposition = RTAS_DISP_FULLY_RECOVERED;
+   }
switch (err_sub_type) {
case MC_ERROR_UE_IFETCH:
mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
-- 
2.17.2



[PATCH v4] powerpc/pseries: Handle UE event for memcpy_mcsafe

2020-03-26 Thread Ganesh Goudar
memcpy_mcsafe has been implemented for power machines which is used
by pmem infrastructure, so that an UE encountered during memcpy from
pmem devices would not result in panic instead a right error code
is returned. The implementation expects machine check handler to ignore
the event and set nip to continue the execution from fixup code.

Appropriate changes are already made to powernv machine check handler,
make similar changes to pseries machine check handler to ignore the
event and set nip to continue execution at the fixup entry if we
hit UE at an instruction with a fixup entry.

While we are at it, have a common function which searches the exception
table entry and updates nip with the fixup address, so that any future common
changes can be made in this function that are valid for both platforms.

powernv changes are made by
commit 895e3dceeb97 ("powerpc/mce: Handle UE event for memcpy_mcsafe")

Reviewed-by: Mahesh Salgaonkar 
Reviewed-by: Santosh S 
Signed-off-by: Ganesh Goudar 
---
V2: Fixes a trivial checkpatch error in commit msg.
V3: Use proper subject prefix.
V4: Rephrase the commit message.
Define a common function to update nip with fixup address.
---
 arch/powerpc/include/asm/mce.h   |  2 ++
 arch/powerpc/kernel/mce.c| 14 ++
 arch/powerpc/kernel/mce_power.c  |  8 ++--
 arch/powerpc/platforms/pseries/ras.c |  3 +++
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 6a6ddaabdb34..376a395daf32 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -218,6 +218,8 @@ extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+extern void mce_common_process_ue(struct pt_regs *regs,
+ struct mce_error_info *mce_err);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 34c1001e9e8b..8077b5fb18a7 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -251,6 +252,19 @@ void machine_check_queue_event(void)
/* Queue irq work to process this event later. */
irq_work_queue(&mce_event_process_work);
 }
+
+void mce_common_process_ue(struct pt_regs *regs,
+  struct mce_error_info *mce_err)
+{
+   const struct exception_table_entry *entry;
+
+   entry = search_kernel_exception_table(regs->nip);
+   if (entry) {
+   mce_err->ignore_event = true;
+   regs->nip = extable_fixup(entry);
+   }
+}
+
 /*
  * process pending MCE event from the mce event queue. This function will be
  * called during syscall exit.
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 1cbf7f1a4e3d..067b094bfeff 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -579,14 +579,10 @@ static long mce_handle_ue_error(struct pt_regs *regs,
struct mce_error_info *mce_err)
 {
long handled = 0;
-   const struct exception_table_entry *entry;
 
-   entry = search_kernel_exception_table(regs->nip);
-   if (entry) {
-   mce_err->ignore_event = true;
-   regs->nip = extable_fixup(entry);
+   mce_common_process_ue(regs, mce_err);
+   if (mce_err->ignore_event)
return 1;
-   }
 
/*
 * On specific SCOM read via MMIO we may get a machine check
diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 43710b69e09e..1d1da639b8b7 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -558,6 +558,9 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
switch (mce_log->error_type) {
case MC_ERROR_TYPE_UE:
mce_err.error_type = MCE_ERROR_TYPE_UE;
+   mce_common_process_ue(regs, &mce_err);
+   if (mce_err.ignore_event)
+   disposition = RTAS_DISP_FULLY_RECOVERED;
switch (err_sub_type) {
case MC_ERROR_UE_IFETCH:
mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH;
-- 
2.17.2



[PATCH] powerpc/mce: Add MCE notification chain

2020-03-30 Thread Ganesh Goudar
From: Santosh S 

Introduce notification chain which lets know about uncorrected memory
errors(UE). This would help prospective users in pmem or nvdimm subsystem
to track bad blocks for better handling of persistent memory allocations.

Signed-off-by: Santosh S 
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/mce.h |  2 ++
 arch/powerpc/kernel/mce.c  | 15 +++
 2 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 6a6ddaabdb34..6e222a94a68a 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -218,6 +218,8 @@ extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+int mce_register_notifier(struct notifier_block *nb);
+int mce_unregister_notifier(struct notifier_block *nb);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 34c1001e9e8b..f50d7f56c02c 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -47,6 +47,20 @@ static struct irq_work mce_ue_event_irq_work = {
 
 DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
+static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
+
+int mce_register_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_register(&mce_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_notifier);
+
+int mce_unregister_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_notifier);
+
 static void mce_set_error_info(struct machine_check_event *mce,
   struct mce_error_info *mce_err)
 {
@@ -263,6 +277,7 @@ static void machine_process_ue_event(struct work_struct 
*work)
while (__this_cpu_read(mce_ue_count) > 0) {
index = __this_cpu_read(mce_ue_count) - 1;
evt = this_cpu_ptr(&mce_ue_event_queue[index]);
+   blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
 #ifdef CONFIG_MEMORY_FAILURE
/*
 * This should probably queued elsewhere, but
-- 
2.17.2



[PATCH 1/2] powerpc/mce: Add helper functions to remove duplicate code

2020-04-26 Thread Ganesh Goudar
mce_handle_ierror() and mce_handle_derror() has some duplicate
code to recover from the recoverable MCE errors and to get the
MCE error sub-type while generating MCE error info, Add helper
functions to remove it.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce_power.c | 136 +---
 1 file changed, 56 insertions(+), 80 deletions(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 067b094bfeff..143e79450e93 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -391,6 +391,56 @@ static int mce_find_instr_ea_and_phys(struct pt_regs 
*regs, uint64_t *addr,
return -1;
 }
 
+static int mce_correct_err(unsigned int err_type)
+{
+   int handled = 0;
+
+   /* attempt to correct the error */
+   switch (err_type) {
+   case MCE_ERROR_TYPE_SLB:
+   if (local_paca->in_mce == 1)
+   slb_save_contents(local_paca->mce_faulty_slbs);
+   handled = mce_flush(MCE_FLUSH_SLB);
+   break;
+   case MCE_ERROR_TYPE_ERAT:
+   handled = mce_flush(MCE_FLUSH_ERAT);
+   break;
+   case MCE_ERROR_TYPE_TLB:
+   handled = mce_flush(MCE_FLUSH_TLB);
+   break;
+   }
+   return handled;
+}
+
+static void mce_err_get_sub_type(struct mce_error_info *mce_err,
+unsigned int err_type,
+unsigned int err_sub_type)
+{
+   switch (err_type) {
+   case MCE_ERROR_TYPE_UE:
+   mce_err->u.ue_error_type = err_sub_type;
+   break;
+   case MCE_ERROR_TYPE_SLB:
+   mce_err->u.slb_error_type = err_sub_type;
+   break;
+   case MCE_ERROR_TYPE_ERAT:
+   mce_err->u.erat_error_type = err_sub_type;
+   break;
+   case MCE_ERROR_TYPE_TLB:
+   mce_err->u.tlb_error_type = err_sub_type;
+   break;
+   case MCE_ERROR_TYPE_USER:
+   mce_err->u.user_error_type = err_sub_type;
+   break;
+   case MCE_ERROR_TYPE_RA:
+   mce_err->u.ra_error_type = err_sub_type;
+   break;
+   case MCE_ERROR_TYPE_LINK:
+   mce_err->u.link_error_type = err_sub_type;
+   break;
+   }
+}
+
 static int mce_handle_ierror(struct pt_regs *regs,
const struct mce_ierror_table table[],
struct mce_error_info *mce_err, uint64_t *addr,
@@ -405,48 +455,13 @@ static int mce_handle_ierror(struct pt_regs *regs,
for (i = 0; table[i].srr1_mask; i++) {
if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
continue;
-
-   /* attempt to correct the error */
-   switch (table[i].error_type) {
-   case MCE_ERROR_TYPE_SLB:
-   if (local_paca->in_mce == 1)
-   slb_save_contents(local_paca->mce_faulty_slbs);
-   handled = mce_flush(MCE_FLUSH_SLB);
-   break;
-   case MCE_ERROR_TYPE_ERAT:
-   handled = mce_flush(MCE_FLUSH_ERAT);
-   break;
-   case MCE_ERROR_TYPE_TLB:
-   handled = mce_flush(MCE_FLUSH_TLB);
-   break;
-   }
+   handled = mce_correct_err(table[i].error_type);
 
/* now fill in mce_error_info */
mce_err->error_type = table[i].error_type;
mce_err->error_class = table[i].error_class;
-   switch (table[i].error_type) {
-   case MCE_ERROR_TYPE_UE:
-   mce_err->u.ue_error_type = table[i].error_subtype;
-   break;
-   case MCE_ERROR_TYPE_SLB:
-   mce_err->u.slb_error_type = table[i].error_subtype;
-   break;
-   case MCE_ERROR_TYPE_ERAT:
-   mce_err->u.erat_error_type = table[i].error_subtype;
-   break;
-   case MCE_ERROR_TYPE_TLB:
-   mce_err->u.tlb_error_type = table[i].error_subtype;
-   break;
-   case MCE_ERROR_TYPE_USER:
-   mce_err->u.user_error_type = table[i].error_subtype;
-   break;
-   case MCE_ERROR_TYPE_RA:
-   mce_err->u.ra_error_type = table[i].error_subtype;
-   break;
-   case MCE_ERROR_TYPE_LINK:
-   mce_err->u.link_error_type = table[i].error_subtype;
-   break;
-   }
+   mce_err_get_sub_type(mce_err, table[i].error_type,
+table[i].error_subtype);
mce_err->sync_error = table[i

[PATCH 2/2] powerpc/mce: Do not poison the memory using guest effective addr

2020-04-26 Thread Ganesh Goudar
As of now, if we hit UE due to memory failure in guest, host MCE
handler tries to find the pfn or physical address where the memory
error occurred using guest effective address and uses that pfn to
poison the memory, which is not right.

If we hit UE in guest, do not try to find pfn in host and thereby
avoid poisoning the memory in host.

Reviewed-by: Mahesh Salgaonkar 
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce_power.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 143e79450e93..4e541dd3af0d 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -471,6 +471,10 @@ static int mce_handle_ierror(struct pt_regs *regs,
table[i].error_type == MCE_ERROR_TYPE_UE) {
unsigned long pfn;
 
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+   if (get_paca()->kvm_hstate.in_guest)
+   return handled;
+#endif
if (get_paca()->in_mce < MAX_MCE_DEPTH) {
pfn = addr_to_pfn(regs, regs->nip);
if (pfn != ULONG_MAX) {
@@ -515,6 +519,7 @@ static int mce_handle_derror(struct pt_regs *regs,
 */
if (found)
continue;
+   found = 1;
 
/* now fill in mce_error_info */
mce_err->error_type = table[i].error_type;
@@ -528,6 +533,10 @@ static int mce_handle_derror(struct pt_regs *regs,
*addr = regs->dar;
else if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+   if (get_paca()->kvm_hstate.in_guest)
+   continue;
+#endif
/*
 * We do a maximum of 4 nested MCE calls, see
 * kernel/exception-64s.h
@@ -536,7 +545,6 @@ static int mce_handle_derror(struct pt_regs *regs,
mce_find_instr_ea_and_phys(regs, addr,
   phys_addr);
}
-   found = 1;
}
 
if (found)
-- 
2.17.2



[PATCH] powerpc: dump kernel log before carrying out fadump or kdump

2019-08-21 Thread Ganesh Goudar
Die or panic path in system reset handler dumps kernel log to
nvram, since commit 4388c9b3a6ee ("powerpc: Do not send system
reset request through the oops path") system reset request is
not allowed to take die path if fadump or kdump is configured,
hence we miss dumping kernel log to nvram, call kmsg_dump()
before carrying out fadump or kdump.

Fixes: 4388c9b3a6ee ("powerpc: Do not send system reset request through the 
oops path")
Reviewed-by: Mahesh Salgaonkar 
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/traps.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 11caa0291254..82f43535e686 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -472,6 +472,7 @@ void system_reset_exception(struct pt_regs *regs)
if (debugger(regs))
goto out;
 
+   kmsg_dump(KMSG_DUMP_OOPS);
/*
 * A system reset is a request to dump, so we always send
 * it through the crashdump code (if fadump or kdump are
-- 
2.17.2



[PATCH v2] powerpc: dump kernel log before carrying out fadump or kdump

2019-09-04 Thread Ganesh Goudar
Since commit 4388c9b3a6ee ("powerpc: Do not send system reset request
through the oops path"), pstore dmesg file is not updated when dump is
triggered from HMC. This commit modified system reset (sreset) handler
to invoke fadump or kdump (if configured), without pushing dmesg to
pstore. This leaves pstore to have old dmesg data which won't be much
of a help if kdump fails to capture the dump. This patch fixes that by
calling kmsg_dump() before heading to fadump or kdump.

Fixes: 4388c9b3a6ee ("powerpc: Do not send system reset request through the 
oops path")
Reviewed-by: Mahesh Salgaonkar 
Reviewed-by: Nicholas Piggin 
Signed-off-by: Ganesh Goudar 
---
V2: Rephrasing the commit message
---
 arch/powerpc/kernel/traps.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 11caa0291254..82f43535e686 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -472,6 +472,7 @@ void system_reset_exception(struct pt_regs *regs)
if (debugger(regs))
goto out;
 
+   kmsg_dump(KMSG_DUMP_OOPS);
/*
 * A system reset is a request to dump, so we always send
 * it through the crashdump code (if fadump or kdump are
-- 
2.17.2



[RFC PATCH v2 0/3] Asynchronous EEH recovery

2023-07-24 Thread Ganesh Goudar
Hi,

EEH recovery is currently serialized and these patches shorten
the time taken for EEH recovery by making the recovery to run
in parallel. The original author of these patches is Sam Bobroff,
I have rebased and tested these patches.

On powervm with 64 VFs from the same PHB, I see approximately 48%
reduction in the time taken for EEH recovery.

On powernv with 9 network cards, where 2 cards are installed on one
PHB and 1 card on each of the rest of the PHBs, providing 20 PFs
in total, I see approximately 33% reduction in the time taken for
EEH recovery.

These patches were originally posted as separate RFCs by Sam, and
I rebased and posted them almost a year back. I stopped pursuing
these patches as I was not able to test them on powernv, due to
issues in the drivers of the cards I was testing on, which are
now resolved. Since I am re-posting this after a long time, I am
posting it as a fresh RFC. Please comment.

Thanks.

V2:
 * Since we now have event list per phb, Have per phb event list lock.
 * Appropriate names given to the locks.
 * Remove stale comments (few more to be removed).
 * Initialize event_id to 0 instead of 1.
 * And some cosmetic changes.

Ganesh Goudar (3):
  powerpc/eeh: Synchronization for safety
  powerpc/eeh: Provide a unique ID for each EEH recovery
  powerpc/eeh: Asynchronous recovery

 arch/powerpc/include/asm/eeh.h   |  13 +-
 arch/powerpc/include/asm/eeh_event.h |  10 +-
 arch/powerpc/include/asm/pci-bridge.h|   4 +
 arch/powerpc/include/asm/ppc-pci.h   |   2 +-
 arch/powerpc/kernel/eeh.c| 159 +++--
 arch/powerpc/kernel/eeh_driver.c | 580 +++
 arch/powerpc/kernel/eeh_event.c  |  75 ++-
 arch/powerpc/kernel/eeh_pe.c |  34 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  12 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |   5 +-
 arch/powerpc/platforms/pseries/pci_dlpar.c   |   5 +-
 drivers/pci/hotplug/pnv_php.c|   5 +-
 drivers/pci/hotplug/rpadlpar_core.c  |   2 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  10 +-
 include/linux/mmzone.h   |   2 +-
 15 files changed, 693 insertions(+), 225 deletions(-)

-- 
2.40.1



[RFC PATCH v2 1/3] powerpc/eeh: Synchronization for safety

2023-07-24 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

There is currently little synchronization between EEH error detection
(eeh_dev_check_failure()), EEH error recovery
(eeh_handle_{normal,special}_event()) and the PCI subsystem (device
addition and removal), and so there are race conditions that lead to
crashes (often access to free'd memory or LIST_POISON).

However, a solution must consider:
- EEH error detection can occur in interrupt context, which prevents
the use of a mutex.
- EEH recovery may need to sleep, which prevents the use of a spinlock.
- EEH recovery uses PCI operations that may require the PCI
rescan/remove lock and/or device lock to be held
- PCI operations may hold the rescan/remove and/or device lock when
calling into EEH functions.
- Device driver callbacks may perform arbitrary PCI operations
during recovery, including device removal.

In this patch the existing mutex and spinlock are combined with the
EEH_PE_RECOVERING flag to provide some assurances that are then used
to reduce the race conditions.

The fields to be protected are the ones that provide the structure
of the trees of struct eeh_pe that are held for each PHB: the parent
pointer and child lists and the list of struct eeh_dev, as well as
the pe and pdev pointers within struct eeh_dev.

The existing way of using EEH_PE_RECOVERING is kept and slightly
extended: No struct eeh_pe will be removed while it has the flag set
on it. Additionally, when adding new PEs, they are marked
EEH_PE_RECOVERING if their parent PE is marked: this allows the
recovery thread to assume that all PEs underneath the one it's
processing will continue to exist during recovery.

Both the mutex and spinlock are held while any protected field is
changed or a PE is deleted, so holding either of them (elsewhere) will
keep them stable and safe to access. Additionally, if
EEH_PE_RECOVERING is set on a PE then the locks can be released and
re-acquired safely, as long as the protected fields aren't used while
no locks are held. This is used during recovery to release locks
for long sleeps (i.e. during eeh_wait_state() when we may sleep up to
5 minutes), or to maintain lock ordering.

The spinlock is used in error detection (which cannot use a mutex, see
above) and also where it's possible that the mutex is already held.
The mutex is used in areas that don't have that restriction, and where
blocking may be required. Care must be taken when ordering these locks
against the PCI rescan/remove lock and the device locks to avoid
deadlocking.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh.h   |  12 +-
 arch/powerpc/kernel/eeh.c| 112 ++--
 arch/powerpc/kernel/eeh_driver.c | 288 ++-
 arch/powerpc/kernel/eeh_pe.c |  30 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  12 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |   5 +-
 arch/powerpc/platforms/pseries/pci_dlpar.c   |   5 +-
 drivers/pci/hotplug/pnv_php.c|   5 +-
 drivers/pci/hotplug/rpadlpar_core.c  |   2 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  10 +-
 10 files changed, 365 insertions(+), 116 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 514dd056c2c8..95708c801f27 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -233,7 +233,7 @@ extern int eeh_subsystem_flags;
 extern u32 eeh_max_freezes;
 extern bool eeh_debugfs_no_recover;
 extern struct eeh_ops *eeh_ops;
-extern raw_spinlock_t confirm_error_lock;
+extern raw_spinlock_t eeh_pe_tree_spinlock;
 
 static inline void eeh_add_flag(int flag)
 {
@@ -257,12 +257,12 @@ static inline bool eeh_enabled(void)
 
 static inline void eeh_serialize_lock(unsigned long *flags)
 {
-   raw_spin_lock_irqsave(&confirm_error_lock, *flags);
+   raw_spin_lock_irqsave(&eeh_pe_tree_spinlock, *flags);
 }
 
 static inline void eeh_serialize_unlock(unsigned long flags)
 {
-   raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+   raw_spin_unlock_irqrestore(&eeh_pe_tree_spinlock, flags);
 }
 
 static inline bool eeh_state_active(int state)
@@ -271,11 +271,15 @@ static inline bool eeh_state_active(int state)
== (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 }
 
+void eeh_recovery_lock(void);
+void eeh_recovery_unlock(void);
+void eeh_recovery_must_be_locked(void);
+
 typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
 typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag);
 void eeh_set_pe_aux_size(int size);
 int eeh_phb_pe_create(struct pci_controller *phb);
-int eeh_wait_state(struct eeh_pe *pe, int max_wait);
+int eeh_wait_state(struct eeh_pe *pe, int max_wait, bool unlock);
 struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root);
 struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no);

[RFC PATCH v2 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery

2023-07-24 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

Give a unique ID to each recovery event, to ease log parsing
and prepare for parallel recovery.

Also add some new messages with a very simple format that may
be useful to log-parsers.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh_event.h |   3 +-
 arch/powerpc/include/asm/ppc-pci.h   |   2 +-
 arch/powerpc/kernel/eeh.c|  42 +++---
 arch/powerpc/kernel/eeh_driver.c | 189 +++
 arch/powerpc/kernel/eeh_event.c  |  12 +-
 include/linux/mmzone.h   |   2 +-
 6 files changed, 147 insertions(+), 103 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh_event.h 
b/arch/powerpc/include/asm/eeh_event.h
index dadde7d52f46..a1fe736bc4cf 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -17,13 +17,14 @@
 struct eeh_event {
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
+   unsigned intid; /* Event ID */
 };
 
 int eeh_event_init(void);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
-void eeh_handle_normal_event(struct eeh_pe *pe);
+void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/ppc-pci.h 
b/arch/powerpc/include/asm/ppc-pci.h
index d9fcff575027..5b82e76dbd19 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb);
 void eeh_addr_cache_insert_dev(struct pci_dev *dev);
 void eeh_addr_cache_rmv_dev(struct pci_dev *dev);
 struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr);
-void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
+void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int 
severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
 int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed);
 void eeh_save_bars(struct eeh_dev *edev);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 6907722c6c1e..733fb290f4b7 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -194,7 +194,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked);
  * for the indicated PCI device, and puts them into a buffer
  * for RTAS error logging.
  */
-static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
+static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev,
+  char *buf, size_t len)
 {
u32 cfg;
int cap, i;
@@ -204,27 +205,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
edev->pe->phb->global_number, edev->bdfn >> 8,
PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
-   pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
+   pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n",
+   event_id,
edev->pe->phb->global_number, edev->bdfn >> 8,
PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
 
eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, &cfg);
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
-   pr_warn("EEH: PCI device/vendor: %08x\n", cfg);
+   pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg);
 
eeh_ops->read_config(edev, PCI_COMMAND, 4, &cfg);
n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
-   pr_warn("EEH: PCI cmd/status register: %08x\n", cfg);
+   pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg);
 
/* Gather bridge-specific registers */
if (edev->mode & EEH_DEV_BRIDGE) {
eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, &cfg);
n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
-   pr_warn("EEH: Bridge secondary status: %04x\n", cfg);
+   pr_warn("EEH(%u): Bridge secondary status: %04x\n",
+   event_id, cfg);
 
eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &cfg);
n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
-   pr_warn("EEH: Bridge control: %04x\n", cfg);
+   pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg);
}
 
/* Dump out the PCI-X command and status regs */
@@ -232,18 +235,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
if (cap) {

[RFC PATCH v2 3/3] powerpc/eeh: Asynchronous recovery

2023-07-24 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

Currently, EEH recovery is entirely serialized and takes place within
a single kernel thread. This can cause recovery to take a long time
when there are many devices.

To shorten recovery time, this change allows recovery to proceed in
parallel in two ways:
- Each PHB is given its own recovery event queue and can be recovered
independently from other PHBs.
- Driver handlers are called in parallel, but with the constraint that
handlers higher up (closer to the PHB) in the PE hierarchy must be
called before those lower down.

To maintain the constraint, above, the driver handlers are called by
traversing the tree of affected PEs from the top, stopping to call
handlers (in parallel) when a PE with devices is discovered. When the
calls for that PE are complete, traversal continues at each child PE.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh.h|   1 +
 arch/powerpc/include/asm/eeh_event.h  |   7 +
 arch/powerpc/include/asm/pci-bridge.h |   4 +
 arch/powerpc/kernel/eeh.c |   5 -
 arch/powerpc/kernel/eeh_driver.c  | 323 +++---
 arch/powerpc/kernel/eeh_event.c   |  69 +++---
 arch/powerpc/kernel/eeh_pe.c  |   4 +
 7 files changed, 294 insertions(+), 119 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 95708c801f27..a99472635350 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */
 #define EEH_DEV_SYSFS  (1 << 9)/* Sysfs created*/
 #define EEH_DEV_REMOVED(1 << 10)   /* Removed permanently  
*/
+#define EEH_DEV_RECOVERING (1 << 11)   /* Recovering   */
 
 struct eeh_dev {
int mode;   /* EEH mode */
diff --git a/arch/powerpc/include/asm/eeh_event.h 
b/arch/powerpc/include/asm/eeh_event.h
index a1fe736bc4cf..b21f49e87b7b 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -8,6 +8,8 @@
 #define ASM_POWERPC_EEH_EVENT_H
 #ifdef __KERNEL__
 
+#include 
+
 /*
  * structure holding pci controller data that describes a
  * change in the isolation status of a PCI slot.  A pointer
@@ -15,16 +17,21 @@
  * callback.
  */
 struct eeh_event {
+   struct work_struct  work;
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
unsigned intid; /* Event ID */
 };
 
+extern spinlock_t eeh_eventlist_lock;
+
 int eeh_event_init(void);
+int eeh_phb_event(struct eeh_pe *pe);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
 void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe);
+void eeh_handle_normal_event_work(struct work_struct *work);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 2aa3a091ef20..61884d9398bf 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -138,6 +138,10 @@ struct pci_controller {
 
/* iommu_ops support */
struct iommu_device iommu;
+
+   bool eeh_in_progress;
+   struct list_head eeh_eventlist;
+   spinlock_t eeh_eventlist_lock;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 733fb290f4b7..12536d892826 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -579,11 +579,6 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 * bridges.
 */
eeh_pe_mark_isolated(pe);
-
-   /* Most EEH events are due to device driver bugs.  Having
-* a stack trace will help the device-driver authors figure
-* out what happened.  So print that out.
-*/
pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n",
__func__, pe->phb->global_number, pe->addr);
eeh_send_failure_event(pe);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index cdf2de0eba57..49f8b99dfb25 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -12,12 +12,17 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 
+static atomic_t eeh_wu_id = ATOMIC_INIT(0);
+
 struct eeh_rmv_data {
struct list_head removed_vf_list;
int removed_dev_count;
@@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool 
enable)
 }
 
 typedef enum pci_e

[RFC 0/3] Asynchronous EEH recovery

2023-06-12 Thread Ganesh Goudar
Hi,

EEH recovery is currently serialized and these patches shorten
the time taken for EEH recovery by making the recovery to run
in parallel. The original author of these patches is Sam Bobroff,
I have rebased and tested these patches.

On powervm with 64 VFs from the same PHB, I see approximately 48%
reduction in the time taken for EEH recovery.

On powernv with 9 network cards, where 2 cards are installed on one
PHB and 1 card on each of the rest of the PHBs, providing 20 PFs
in total, I see approximately 33% reduction in the time taken for
EEH recovery.

These patches were originally posted as separate RFCs by Sam, and
I rebased and posted them almost a year back. I stopped pursuing
these patches as I was not able to test them on powernv, due to
issues in the drivers of the cards I was testing on, which are
now resolved. Since I am re-posting this after a long time, I am
posting it as a fresh RFC. Please comment.

Thanks.  

Ganesh Goudar (3):
  powerpc/eeh: Synchronization for safety
  powerpc/eeh: Provide a unique ID for each EEH recovery
  powerpc/eeh: Asynchronous recovery

 arch/powerpc/include/asm/eeh.h   |   7 +-
 arch/powerpc/include/asm/eeh_event.h |  10 +-
 arch/powerpc/include/asm/pci-bridge.h|   3 +
 arch/powerpc/include/asm/ppc-pci.h   |   2 +-
 arch/powerpc/kernel/eeh.c| 154 +++--
 arch/powerpc/kernel/eeh_driver.c | 580 +++
 arch/powerpc/kernel/eeh_event.c  |  71 ++-
 arch/powerpc/kernel/eeh_pe.c |  33 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  12 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |   5 +-
 arch/powerpc/platforms/pseries/pci_dlpar.c   |   5 +-
 drivers/pci/hotplug/pnv_php.c|   5 +-
 drivers/pci/hotplug/rpadlpar_core.c  |   2 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  10 +-
 include/linux/mmzone.h   |   2 +-
 15 files changed, 687 insertions(+), 214 deletions(-)

-- 
2.40.1



[RFC 1/3] powerpc/eeh: Synchronization for safety

2023-06-12 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

There is currently little synchronization between EEH error detection
(eeh_dev_check_failure()), EEH error recovery
(eeh_handle_{normal,special}_event()) and the PCI subsystem (device
addition and removal), and so there are race conditions that lead to
crashes (often access to free'd memory or LIST_POISON).

However, a solution must consider:
- EEH error detection can occur in interrupt context, which prevents
the use of a mutex.
- EEH recovery may need to sleep, which prevents the use of a spinlock.
- EEH recovery uses PCI operations that may require the PCI
rescan/remove lock and/or device lock to be held
- PCI operations may hold the rescan/remove and/or device lock when
calling into EEH functions.
- Device driver callbacks may perform arbitrary PCI operations
during recovery, including device removal.

In this patch the existing mutex and spinlock are combined with the
EEH_PE_RECOVERING flag to provide some assurances that are then used
to reduce the race conditions.

The fields to be protected are the ones that provide the structure
of the trees of struct eeh_pe that are held for each PHB: the parent
pointer and child lists and the list of struct eeh_dev, as well as
the pe and pdev pointers within struct eeh_dev.

The existing way of using EEH_PE_RECOVERING is kept and slightly
extended: No struct eeh_pe will be removed while it has the flag set
on it. Additionally, when adding new PEs, they are marked
EEH_PE_RECOVERING if their parent PE is marked: this allows the
recovery thread to assume that all PEs underneath the one it's
processing will continue to exist during recovery.

Both the mutex and spinlock are held while any protected field is
changed or a PE is deleted, so holding either of them (elsewhere) will
keep them stable and safe to access. Additionally, if
EEH_PE_RECOVERING is set on a PE then the locks can be released and
re-acquired safely, as long as the protected fields aren't used while
no locks are held. This is used during recovery to release locks
for long sleeps (i.e. during eeh_wait_state() when we may sleep up to
5 minutes), or to maintain lock ordering.

The spinlock is used in error detection (which cannot use a mutex, see
above) and also where it's possible that the mutex is already held.
The mutex is used in areas that don't have that restriction, and where
blocking may be required. Care must be taken when ordering these locks
against the PCI rescan/remove lock and the device locks to avoid
deadlocking.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh.h   |   6 +-
 arch/powerpc/kernel/eeh.c| 112 ++--
 arch/powerpc/kernel/eeh_driver.c | 288 ++-
 arch/powerpc/kernel/eeh_pe.c |  30 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  12 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |   5 +-
 arch/powerpc/platforms/pseries/pci_dlpar.c   |   5 +-
 drivers/pci/hotplug/pnv_php.c|   5 +-
 drivers/pci/hotplug/rpadlpar_core.c  |   2 +
 drivers/vfio/vfio_iommu_spapr_tce.c  |  10 +-
 10 files changed, 365 insertions(+), 110 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 514dd056c2c8..d0f09e691498 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -271,11 +271,15 @@ static inline bool eeh_state_active(int state)
== (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 }
 
+void eeh_recovery_lock(void);
+void eeh_recovery_unlock(void);
+void eeh_recovery_must_be_locked(void);
+
 typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
 typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag);
 void eeh_set_pe_aux_size(int size);
 int eeh_phb_pe_create(struct pci_controller *phb);
-int eeh_wait_state(struct eeh_pe *pe, int max_wait);
+int eeh_wait_state(struct eeh_pe *pe, int max_wait, bool unlock);
 struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root);
 struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ab316e155ea9..2c90c37524ed 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -108,7 +108,25 @@ bool eeh_debugfs_no_recover;
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
 
-/* Lock to avoid races due to multiple reports of an error */
+/*
+ * confirm_error_lock and eeh_dev_mutex are used together to provide
+ * safety during EEH operations.
+ *
+ * Generally, the spinlock is used in error detection where it's not possible
+ * to use a mutex or where there is potential to deadlock with the mutex, and
+ * the mutex is used during recovery and other PCI related operations. One must
+ * be held when reading and both must be held when making changes to the
+ * p

[RFC 3/3] powerpc/eeh: Asynchronous recovery

2023-06-12 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

Currently, EEH recovery is entirely serialized and takes place within
a single kernel thread. This can cause recovery to take a long time
when there are many devices.

To shorten recovery time, this change allows recovery to proceed in
parallel in two ways:
- Each PHB is given its own recovery event queue and can be recovered
independently from other PHBs.
- Driver handlers are called in parallel, but with the constraint that
handlers higher up (closer to the PHB) in the PE hierarchy must be
called before those lower down.

To maintain the constraint, above, the driver handlers are called by
traversing the tree of affected PEs from the top, stopping to call
handlers (in parallel) when a PE with devices is discovered. When the
calls for that PE are complete, traversal continues at each child PE.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh.h|   1 +
 arch/powerpc/include/asm/eeh_event.h  |   7 +
 arch/powerpc/include/asm/pci-bridge.h |   3 +
 arch/powerpc/kernel/eeh_driver.c  | 323 +++---
 arch/powerpc/kernel/eeh_event.c   |  65 +++---
 arch/powerpc/kernel/eeh_pe.c  |   3 +
 6 files changed, 288 insertions(+), 114 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index d0f09e691498..06d7dabdccfe 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */
 #define EEH_DEV_SYSFS  (1 << 9)/* Sysfs created*/
 #define EEH_DEV_REMOVED(1 << 10)   /* Removed permanently  
*/
+#define EEH_DEV_RECOVERING (1 << 11)   /* Recovering   */
 
 struct eeh_dev {
int mode;   /* EEH mode */
diff --git a/arch/powerpc/include/asm/eeh_event.h 
b/arch/powerpc/include/asm/eeh_event.h
index a1fe736bc4cf..b21f49e87b7b 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -8,6 +8,8 @@
 #define ASM_POWERPC_EEH_EVENT_H
 #ifdef __KERNEL__
 
+#include 
+
 /*
  * structure holding pci controller data that describes a
  * change in the isolation status of a PCI slot.  A pointer
@@ -15,16 +17,21 @@
  * callback.
  */
 struct eeh_event {
+   struct work_struct  work;
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
unsigned intid; /* Event ID */
 };
 
+extern spinlock_t eeh_eventlist_lock;
+
 int eeh_event_init(void);
+int eeh_phb_event(struct eeh_pe *pe);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
 void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe);
+void eeh_handle_normal_event_work(struct work_struct *work);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 2aa3a091ef20..55a5ff9ae30b 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -138,6 +138,9 @@ struct pci_controller {
 
/* iommu_ops support */
struct iommu_device iommu;
+
+   bool eeh_in_progress;
+   struct list_head eeh_eventlist;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index cdf2de0eba57..a484d6ef33a1 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -12,12 +12,17 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 
+static atomic_t eeh_wu_id = ATOMIC_INIT(0);
+
 struct eeh_rmv_data {
struct list_head removed_vf_list;
int removed_dev_count;
@@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool 
enable)
 }
 
 typedef enum pci_ers_result (*eeh_report_fn)(unsigned int event_id,
+unsigned int id,
 struct pci_dev *,
 struct pci_driver *);
 static void eeh_pe_report_pdev(unsigned int event_id,
-  struct pci_dev *pdev, eeh_report_fn fn,
+  unsigned int id,
+  struct pci_dev *pdev,
+  const char *fn_name, eeh_report_fn fn,
   enum pci_ers_result *result,
-  const char *handler_name)
+  bool late, bool removed, bool passed)
 {
-   struct eeh_dev *edev;
struct pci

[RFC 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery

2023-06-12 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

Give a unique ID to each recovery event, to ease log parsing
and prepare for parallel recovery.

Also add some new messages with a very simple format that may
be useful to log-parsers.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh_event.h |   3 +-
 arch/powerpc/include/asm/ppc-pci.h   |   2 +-
 arch/powerpc/kernel/eeh.c|  42 +++---
 arch/powerpc/kernel/eeh_driver.c | 189 +++
 arch/powerpc/kernel/eeh_event.c  |  12 +-
 include/linux/mmzone.h   |   2 +-
 6 files changed, 147 insertions(+), 103 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh_event.h 
b/arch/powerpc/include/asm/eeh_event.h
index dadde7d52f46..a1fe736bc4cf 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -17,13 +17,14 @@
 struct eeh_event {
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
+   unsigned intid; /* Event ID */
 };
 
 int eeh_event_init(void);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
-void eeh_handle_normal_event(struct eeh_pe *pe);
+void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/ppc-pci.h 
b/arch/powerpc/include/asm/ppc-pci.h
index d9fcff575027..5b82e76dbd19 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb);
 void eeh_addr_cache_insert_dev(struct pci_dev *dev);
 void eeh_addr_cache_rmv_dev(struct pci_dev *dev);
 struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr);
-void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
+void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int 
severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
 int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed);
 void eeh_save_bars(struct eeh_dev *edev);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 2c90c37524ed..148d5df0e606 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -200,7 +200,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked);
  * for the indicated PCI device, and puts them into a buffer
  * for RTAS error logging.
  */
-static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
+static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev,
+  char *buf, size_t len)
 {
u32 cfg;
int cap, i;
@@ -210,27 +211,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
edev->pe->phb->global_number, edev->bdfn >> 8,
PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
-   pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
+   pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n",
+   event_id,
edev->pe->phb->global_number, edev->bdfn >> 8,
PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
 
eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, &cfg);
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
-   pr_warn("EEH: PCI device/vendor: %08x\n", cfg);
+   pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg);
 
eeh_ops->read_config(edev, PCI_COMMAND, 4, &cfg);
n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
-   pr_warn("EEH: PCI cmd/status register: %08x\n", cfg);
+   pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg);
 
/* Gather bridge-specific registers */
if (edev->mode & EEH_DEV_BRIDGE) {
eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, &cfg);
n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
-   pr_warn("EEH: Bridge secondary status: %04x\n", cfg);
+   pr_warn("EEH(%u): Bridge secondary status: %04x\n",
+   event_id, cfg);
 
eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &cfg);
n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
-   pr_warn("EEH: Bridge control: %04x\n", cfg);
+   pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg);
}
 
/* Dump out the PCI-X command and status regs */
@@ -238,18 +241,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
if (cap) {

[PATCH] powerpc/pseries/mce: Avoid instrumentation in realmode

2022-08-29 Thread Ganesh Goudar
Part of machine check error handling is done in realmode,
As of now instrumentation is not possible for any code that
runs in realmode.
When MCE is injected on KASAN enabled kernel, crash is
observed, Hence force inline or mark no instrumentation
for functions which can run in realmode to avoid KASAN
instrumentation.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/interrupt.h | 2 +-
 arch/powerpc/include/asm/rtas.h  | 4 ++--
 arch/powerpc/kernel/rtas.c   | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 8069dbc4b8d1..090895051712 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -92,7 +92,7 @@ static inline bool is_implicit_soft_masked(struct pt_regs 
*regs)
return search_kernel_soft_mask_table(regs->nip);
 }
 
-static inline void srr_regs_clobbered(void)
+static __always_inline void srr_regs_clobbered(void)
 {
local_paca->srr_valid = 0;
local_paca->hsrr_valid = 0;
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 00531af17ce0..52d29d664fdf 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -201,13 +201,13 @@ inline uint32_t rtas_ext_event_company_id(struct 
rtas_ext_event_log_v6 *ext_log)
 #define PSERIES_ELOG_SECT_ID_MCE   (('M' << 8) | 'C')
 
 static
-inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect)
+__always_inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect)
 {
return be16_to_cpu(sect->id);
 }
 
 static
-inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect)
+__always_inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect)
 {
return be16_to_cpu(sect->length);
 }
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 693133972294..f9d78245c0e8 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -48,7 +48,7 @@
 /* This is here deliberately so it's only used in this file */
 void enter_rtas(unsigned long);
 
-static inline void do_enter_rtas(unsigned long args)
+static __always_inline void do_enter_rtas(unsigned long args)
 {
unsigned long msr;
 
@@ -435,7 +435,7 @@ static char *__fetch_rtas_last_error(char *altbuf)
 #endif
 
 
-static void
+noinstr static void
 va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret,
  va_list list)
 {
-- 
2.37.1



[PATCH v2] powerpc/pseries/mce: Avoid instrumentation in realmode

2022-09-04 Thread Ganesh Goudar
Part of machine check error handling is done in realmode,
As of now instrumentation is not possible for any code that
runs in realmode.
When MCE is injected on KASAN enabled kernel, crash is
observed, Hence force inline or mark no instrumentation
for functions which can run in realmode, to avoid KASAN
instrumentation.

Signed-off-by: Ganesh Goudar 
---
v2: Force inline few more functions.
---
 arch/powerpc/include/asm/hw_irq.h| 8 
 arch/powerpc/include/asm/interrupt.h | 2 +-
 arch/powerpc/include/asm/rtas.h  | 4 ++--
 arch/powerpc/kernel/rtas.c   | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index 26ede09c521d..3264991fe524 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -111,7 +111,7 @@ static inline void __hard_RI_enable(void)
 #ifdef CONFIG_PPC64
 #include 
 
-static inline notrace unsigned long irq_soft_mask_return(void)
+static __always_inline notrace unsigned long irq_soft_mask_return(void)
 {
return READ_ONCE(local_paca->irq_soft_mask);
 }
@@ -121,7 +121,7 @@ static inline notrace unsigned long 
irq_soft_mask_return(void)
  * for the critical section and as a clobber because
  * we changed paca->irq_soft_mask
  */
-static inline notrace void irq_soft_mask_set(unsigned long mask)
+static __always_inline notrace void irq_soft_mask_set(unsigned long mask)
 {
/*
 * The irq mask must always include the STD bit if any are set.
@@ -144,7 +144,7 @@ static inline notrace void irq_soft_mask_set(unsigned long 
mask)
barrier();
 }
 
-static inline notrace unsigned long irq_soft_mask_set_return(unsigned long 
mask)
+static __always_inline notrace unsigned long irq_soft_mask_set_return(unsigned 
long mask)
 {
unsigned long flags = irq_soft_mask_return();
 
@@ -162,7 +162,7 @@ static inline notrace unsigned long 
irq_soft_mask_or_return(unsigned long mask)
return flags;
 }
 
-static inline unsigned long arch_local_save_flags(void)
+static __always_inline unsigned long arch_local_save_flags(void)
 {
return irq_soft_mask_return();
 }
diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 8069dbc4b8d1..090895051712 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -92,7 +92,7 @@ static inline bool is_implicit_soft_masked(struct pt_regs 
*regs)
return search_kernel_soft_mask_table(regs->nip);
 }
 
-static inline void srr_regs_clobbered(void)
+static __always_inline void srr_regs_clobbered(void)
 {
local_paca->srr_valid = 0;
local_paca->hsrr_valid = 0;
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 00531af17ce0..52d29d664fdf 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -201,13 +201,13 @@ inline uint32_t rtas_ext_event_company_id(struct 
rtas_ext_event_log_v6 *ext_log)
 #define PSERIES_ELOG_SECT_ID_MCE   (('M' << 8) | 'C')
 
 static
-inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect)
+__always_inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect)
 {
return be16_to_cpu(sect->id);
 }
 
 static
-inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect)
+__always_inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect)
 {
return be16_to_cpu(sect->length);
 }
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 693133972294..f9d78245c0e8 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -48,7 +48,7 @@
 /* This is here deliberately so it's only used in this file */
 void enter_rtas(unsigned long);
 
-static inline void do_enter_rtas(unsigned long args)
+static __always_inline void do_enter_rtas(unsigned long args)
 {
unsigned long msr;
 
@@ -435,7 +435,7 @@ static char *__fetch_rtas_last_error(char *altbuf)
 #endif
 
 
-static void
+noinstr static void
 va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret,
  va_list list)
 {
-- 
2.37.1



[PATCH v3] powerpc/pseries/mce: Avoid instrumentation in realmode

2022-09-25 Thread Ganesh Goudar
Part of machine check error handling is done in realmode,
As of now instrumentation is not possible for any code that
runs in realmode.
When MCE is injected on KASAN enabled kernel, crash is
observed, Hence force inline or mark no instrumentation
for functions which can run in realmode, to avoid KASAN
instrumentation.

Signed-off-by: Ganesh Goudar 
---
v2: Force inline few more functions.

v3: Adding noinstr to few functions instead of __always_inline.
---
 arch/powerpc/include/asm/hw_irq.h| 8 
 arch/powerpc/include/asm/interrupt.h | 2 +-
 arch/powerpc/include/asm/rtas.h  | 4 ++--
 arch/powerpc/kernel/rtas.c   | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index 983551859891..c4d542b4a623 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -111,7 +111,7 @@ static inline void __hard_RI_enable(void)
 #ifdef CONFIG_PPC64
 #include 
 
-static inline notrace unsigned long irq_soft_mask_return(void)
+noinstr static unsigned long irq_soft_mask_return(void)
 {
unsigned long flags;
 
@@ -128,7 +128,7 @@ static inline notrace unsigned long 
irq_soft_mask_return(void)
  * for the critical section and as a clobber because
  * we changed paca->irq_soft_mask
  */
-static inline notrace void irq_soft_mask_set(unsigned long mask)
+noinstr static void irq_soft_mask_set(unsigned long mask)
 {
/*
 * The irq mask must always include the STD bit if any are set.
@@ -155,7 +155,7 @@ static inline notrace void irq_soft_mask_set(unsigned long 
mask)
: "memory");
 }
 
-static inline notrace unsigned long irq_soft_mask_set_return(unsigned long 
mask)
+noinstr static unsigned long irq_soft_mask_set_return(unsigned long mask)
 {
unsigned long flags;
 
@@ -191,7 +191,7 @@ static inline notrace unsigned long 
irq_soft_mask_or_return(unsigned long mask)
return flags;
 }
 
-static inline unsigned long arch_local_save_flags(void)
+static __always_inline unsigned long arch_local_save_flags(void)
 {
return irq_soft_mask_return();
 }
diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 8069dbc4b8d1..090895051712 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -92,7 +92,7 @@ static inline bool is_implicit_soft_masked(struct pt_regs 
*regs)
return search_kernel_soft_mask_table(regs->nip);
 }
 
-static inline void srr_regs_clobbered(void)
+static __always_inline void srr_regs_clobbered(void)
 {
local_paca->srr_valid = 0;
local_paca->hsrr_valid = 0;
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 00531af17ce0..52d29d664fdf 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -201,13 +201,13 @@ inline uint32_t rtas_ext_event_company_id(struct 
rtas_ext_event_log_v6 *ext_log)
 #define PSERIES_ELOG_SECT_ID_MCE   (('M' << 8) | 'C')
 
 static
-inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect)
+__always_inline uint16_t pseries_errorlog_id(struct pseries_errorlog *sect)
 {
return be16_to_cpu(sect->id);
 }
 
 static
-inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect)
+__always_inline uint16_t pseries_errorlog_length(struct pseries_errorlog *sect)
 {
return be16_to_cpu(sect->length);
 }
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 693133972294..f9d78245c0e8 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -48,7 +48,7 @@
 /* This is here deliberately so it's only used in this file */
 void enter_rtas(unsigned long);
 
-static inline void do_enter_rtas(unsigned long args)
+static __always_inline void do_enter_rtas(unsigned long args)
 {
unsigned long msr;
 
@@ -435,7 +435,7 @@ static char *__fetch_rtas_last_error(char *altbuf)
 #endif
 
 
-static void
+noinstr static void
 va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret,
  va_list list)
 {
-- 
2.37.1



[PATCH 0/1] Parallel EEH recovery between PHBs

2024-02-25 Thread Ganesh Goudar
This change is based on Sam Bobroff's patches which aimed
to allow recovery to happen in parallel between PHBs and PEs,
Due to various reasons the patches did not get in.

But having parallel recovery between PHBs is fairly simple and
gives significant improvement on powervm, Since powervm maintains
flat hierarchy for PCI devices.
This patch enables PHBs to have separate event queues and shorten
the time taken for EEH recovery by making the recovery to run in
parallel between PHBs.

On powervm with 64 VFs from same PHB,  I see approximately 48%
reduction in time taken in EEH recovery. On powernv the improvement
is not so significant.

Ganesh Goudar (1):
  powerpc/eeh: Enable PHBs to recover in parallel

 arch/powerpc/include/asm/eeh_event.h  |  7 +
 arch/powerpc/include/asm/pci-bridge.h |  4 +++
 arch/powerpc/kernel/eeh_driver.c  | 27 +--
 arch/powerpc/kernel/eeh_event.c   | 38 ++-
 arch/powerpc/kernel/eeh_pe.c  |  4 +++
 5 files changed, 77 insertions(+), 3 deletions(-)

-- 
2.43.2



[PATCH 1/1] powerpc/eeh: Enable PHBs to recover in parallel

2024-02-25 Thread Ganesh Goudar
Currently, with a single event queue EEH recovery is entirely
serialized and takes place within a single kernel thread. This
can cause recovery to take a long time when there are many
devices.

Have the recovery event queue per PHB and allow the recovery to
happen independently from other PHBs.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh_event.h  |  7 +
 arch/powerpc/include/asm/pci-bridge.h |  4 +++
 arch/powerpc/kernel/eeh_driver.c  | 27 +--
 arch/powerpc/kernel/eeh_event.c   | 38 ++-
 arch/powerpc/kernel/eeh_pe.c  |  4 +++
 5 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh_event.h 
b/arch/powerpc/include/asm/eeh_event.h
index dadde7d52f46..6af1b5bb6103 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -8,6 +8,8 @@
 #define ASM_POWERPC_EEH_EVENT_H
 #ifdef __KERNEL__
 
+#include 
+
 /*
  * structure holding pci controller data that describes a
  * change in the isolation status of a PCI slot.  A pointer
@@ -15,15 +17,20 @@
  * callback.
  */
 struct eeh_event {
+   struct work_struct  work;
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
 };
 
+extern spinlock_t eeh_eventlist_lock;
+
 int eeh_event_init(void);
+int eeh_phb_event(struct eeh_pe *pe);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
 void eeh_handle_normal_event(struct eeh_pe *pe);
+void eeh_handle_normal_event_work(struct work_struct *work);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 2aa3a091ef20..61884d9398bf 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -138,6 +138,10 @@ struct pci_controller {
 
/* iommu_ops support */
struct iommu_device iommu;
+
+   bool eeh_in_progress;
+   struct list_head eeh_eventlist;
+   spinlock_t eeh_eventlist_lock;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 48773d2d9be3..d5612303766e 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -1107,6 +1107,30 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 }
 
+void eeh_handle_normal_event_work(struct work_struct *work)
+{
+   unsigned long flags;
+   struct eeh_event *event = container_of(work, struct eeh_event, work);
+   struct pci_controller *phb = event->pe->phb;
+
+   eeh_handle_normal_event(event->pe);
+
+   kfree(event);
+   spin_lock_irqsave(&phb->eeh_eventlist_lock, flags);
+   WARN_ON_ONCE(!phb->eeh_in_progress);
+   if (list_empty(&phb->eeh_eventlist)) {
+   phb->eeh_in_progress = false;
+   pr_debug("EEH: No more work to do\n");
+   } else {
+   pr_warn("EEH: More work to do\n");
+   event = list_entry(phb->eeh_eventlist.next,
+  struct eeh_event, list);
+   list_del(&event->list);
+   queue_work(system_unbound_wq, &event->work);
+   }
+   spin_unlock_irqrestore(&phb->eeh_eventlist_lock, flags);
+}
+
 /**
  * eeh_handle_special_event - Handle EEH events without a specific failing PE
  *
@@ -1176,8 +1200,7 @@ void eeh_handle_special_event(void)
 */
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
rc == EEH_NEXT_ERR_FENCED_PHB) {
-   eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
-   eeh_handle_normal_event(pe);
+   eeh_phb_event(pe);
} else {
eeh_for_each_pe(pe, tmp_pe)
eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index c23a454af08a..86c0a988389e 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -22,7 +22,7 @@
  *  work-queue, where a worker thread can drive recovery.
  */
 
-static DEFINE_SPINLOCK(eeh_eventlist_lock);
+DEFINE_SPINLOCK(eeh_eventlist_lock);
 static DECLARE_COMPLETION(eeh_eventlist_event);
 static LIST_HEAD(eeh_eventlist);
 
@@ -91,6 +91,42 @@ int eeh_event_init(void)
return 0;
 }
 
+int eeh_phb_event(struct eeh_pe *pe)
+{
+   struct eeh_event *event;
+   unsigned long flags;
+   struct pci_controller *phb;
+
+   event = kzalloc(sizeof(*event), GFP_ATOMIC);
+   if (!event)
+   return -ENOMEM;
+
+   if (pe) {
+   phb = pe->

[PATCH v2] powerpc/mce: log the error for all unrecoverable errors

2023-01-27 Thread Ganesh Goudar
For all unrecoverable errors we are missing to log the
error, Since machine_check_log_err() is not getting called
for unrecoverable errors.

Raise irq work in save_mce_event() for unrecoverable errors,
So that we log the error from MCE event handling block in
timer handler.

Log without this change

 MCE: CPU27: machine check (Severe)  Real address Load/Store (foreign/control 
memory) [Not recovered]
 MCE: CPU27: PID: 10580 Comm: inject-ra-err NIP: [1df4]
 MCE: CPU27: Initiator CPU
 MCE: CPU27: Unknown

Log with this change

 MCE: CPU24: machine check (Severe)  Real address Load/Store (foreign/control 
memory) [Not recovered]
 MCE: CPU24: PID: 1589811 Comm: inject-ra-err NIP: [1e48]
 MCE: CPU24: Initiator CPU
 MCE: CPU24: Unknown
 RTAS: event: 5, Type: Platform Error (224), Severity: 3

Signed-off-by: Ganesh Goudar 
Reviewed-by: Mahesh Salgaonkar 
---
V2: Rephrasing the commit message.
---
 arch/powerpc/kernel/mce.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 6c5d30fba766..a1cb2172eb7b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (mce->error_type == MCE_ERROR_TYPE_UE)
mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
+   /*
+* Raise irq work, So that we don't miss to log the error for
+* unrecoverable errors.
+*/
+   if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED)
+   mce_irq_work_queue();
+
if (!addr)
return;
 
@@ -235,7 +242,6 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
   evt, sizeof(*evt));
 
/* Queue work to process this event later. */
-   mce_irq_work_queue();
 }
 
 /*
-- 
2.38.1



[PATCH v3] powerpc/mce: log the error for all unrecoverable errors

2023-02-01 Thread Ganesh Goudar
For all unrecoverable errors we are missing to log the
error, Since machine_check_log_err() is not getting called
for unrecoverable errors. machine_check_log_err() is called
from deferred handler, To run deferred handlers we have to do
irq work raise from the exception handler.

For recoverable errors exception vector code takes care of
running deferred handlers.

For unrecoverable errors raise irq work in save_mce_event(),
So that we log the error from MCE deferred handler.

Log without this change

 MCE: CPU27: machine check (Severe)  Real address Load/Store (foreign/control 
memory) [Not recovered]
 MCE: CPU27: PID: 10580 Comm: inject-ra-err NIP: [1df4]
 MCE: CPU27: Initiator CPU
 MCE: CPU27: Unknown

Log with this change

 MCE: CPU24: machine check (Severe)  Real address Load/Store (foreign/control 
memory) [Not recovered]
 MCE: CPU24: PID: 1589811 Comm: inject-ra-err NIP: [1e48]
 MCE: CPU24: Initiator CPU
 MCE: CPU24: Unknown
 RTAS: event: 5, Type: Platform Error (224), Severity: 3

Signed-off-by: Ganesh Goudar 
Reviewed-by: Mahesh Salgaonkar 
---
V3: Rephrasing the commit message.
---
 arch/powerpc/kernel/mce.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 6c5d30fba766..219f28637a3e 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (mce->error_type == MCE_ERROR_TYPE_UE)
mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
+   /*
+* Raise irq work, So that we don't miss to log the error for
+* unrecoverable errors.
+*/
+   if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED)
+   mce_irq_work_queue();
+
if (!addr)
return;
 
@@ -233,9 +240,6 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
}
memcpy(&local_paca->mce_info->mce_ue_event_queue[index],
   evt, sizeof(*evt));
-
-   /* Queue work to process this event later. */
-   mce_irq_work_queue();
 }
 
 /*
-- 
2.39.1



[PATCH] powerpc/eeh: Set channel state after notifying the drivers

2023-02-09 Thread Ganesh Goudar
When a PCI error is encountered 6th time in an hour we
set the channel state to perm_failure and notify the
driver about the permanent failure.

However, after upstream commit 38ddc011478e ("powerpc/eeh:
Make permanently failed devices non-actionable"), EEH handler
stops calling any routine once the device is marked as
permanent failure. This issue can lead to fatal consequences
like kernel hang with certain PCI devices.

Following log is observed with lpfc driver, with and without
this change, Without this change kernel hangs, If PCI error
is encountered 6 times for a device in an hour.

Without the change

 EEH: Beginning: 'error_detected(permanent failure)'
 PCI 0132:60:00.0#60: EEH: not actionable (1,1,1)
 PCI 0132:60:00.1#60: EEH: not actionable (1,1,1)
 EEH: Finished:'error_detected(permanent failure)'

With the change

 EEH: Beginning: 'error_detected(permanent failure)'
 EEH: Invoking lpfc->error_detected(permanent failure)
 EEH: lpfc driver reports: 'disconnect'
 EEH: Invoking lpfc->error_detected(permanent failure)
 EEH: lpfc driver reports: 'disconnect'
 EEH: Finished:'error_detected(permanent failure)'

To fix the issue, set channel state to permanent failure after
notifying the drivers.

Fixes: 38ddc011478e ("powerpc/eeh: Make permanently failed devices 
non-actionable")
Suggested-by: Mahesh Salgaonkar 
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/eeh_driver.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index f279295179bd..438568a472d0 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -1065,10 +1065,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
eeh_slot_error_detail(pe, EEH_LOG_PERM);
 
/* Notify all devices that they're about to go down. */
-   eeh_set_channel_state(pe, pci_channel_io_perm_failure);
eeh_set_irq_state(pe, false);
eeh_pe_report("error_detected(permanent failure)", pe,
  eeh_report_failure, NULL);
+   eeh_set_channel_state(pe, pci_channel_io_perm_failure);
 
/* Mark the PE to be removed permanently */
eeh_pe_state_mark(pe, EEH_PE_REMOVED);
@@ -1185,10 +1185,10 @@ void eeh_handle_special_event(void)
 
/* Notify all devices to be down */
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
-   eeh_set_channel_state(pe, pci_channel_io_perm_failure);
eeh_pe_report(
"error_detected(permanent failure)", pe,
eeh_report_failure, NULL);
+   eeh_set_channel_state(pe, pci_channel_io_perm_failure);
 
pci_lock_rescan_remove();
list_for_each_entry(hose, &hose_list, list_node) {
-- 
2.39.1



[PATCH] powerpc/eeh: Permanently disable the removed device

2024-04-05 Thread Ganesh Goudar
When a device is hot removed on powernv, the hotplug
driver clears the device's state. However, on pseries,
if a device is removed by phyp after reaching the error
threshold, the kernel remains unaware, leading to the
device not being torn down. This prevents necessary
remediation actions like failover.

Permanently disable the device if the presence check
fails.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/eeh.c| 4 +++-
 arch/powerpc/kernel/eeh_driver.c | 8 +++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ab316e155ea9..8d1606406d3f 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -508,7 +508,9 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 * state, PE is in good state.
 */
if ((ret < 0) ||
-   (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) {
+   (ret == EEH_STATE_NOT_SUPPORT &&
+dev->error_state == pci_channel_io_perm_failure) ||
+   eeh_state_active(ret)) {
eeh_stats.false_positives++;
pe->false_positives++;
rc = 0;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 48773d2d9be3..10317badf471 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -867,7 +867,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
if (!devices) {
pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
pe->phb->global_number, pe->addr);
-   goto out; /* nothing to recover */
+   /*
+* The device is removed, Tear down its state,
+* On powernv hotplug driver would take care of
+* it but not on pseries, Permanently disable the
+* card as it is hot removed.
+*/
+   goto recover_failed;
}
 
/* Log the event */
-- 
2.44.0



[PATCH v2] powerpc/eeh: Permanently disable the removed device

2024-04-22 Thread Ganesh Goudar
When a device is hot removed on powernv, the hotplug driver clears
the device's state. However, on pseries, if a device is removed by
phyp after reaching the error threshold, the kernel remains unaware,
leading to the device not being torn down. This prevents necessary
remediation actions like failover.

Permanently disable the device if the presence check fails.

Also, in eeh_dev_check_failure we may consider the error as a false
positive if the device is hotplugged out, as the get_state call returns
EEH_STATE_NOT_SUPPORT and we may end up not clearing the device state,
so log the event if the state is not moved to permanent failure state.

Signed-off-by: Ganesh Goudar 
---
V2:
* Elaborate the commit message.
* Fix formatting issues in commit message and comments.
---
 arch/powerpc/kernel/eeh.c| 11 ++-
 arch/powerpc/kernel/eeh_driver.c | 13 +++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ab316e155ea9..6670063a7a6c 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -506,9 +506,18 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 * We will punt with the following conditions: Failure to get
 * PE's state, EEH not support and Permanently unavailable
 * state, PE is in good state.
+*
+* On the pSeries, after reaching the threshold, get_state might
+* return EEH_STATE_NOT_SUPPORT. However, it's possible that the
+* device state remains uncleared if the device is not marked
+* pci_channel_io_perm_failure. Therefore, consider logging the
+* event to let device removal happen.
+*
 */
if ((ret < 0) ||
-   (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) {
+   (ret == EEH_STATE_NOT_SUPPORT &&
+dev->error_state == pci_channel_io_perm_failure) ||
+   eeh_state_active(ret)) {
eeh_stats.false_positives++;
pe->false_positives++;
rc = 0;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 48773d2d9be3..7efe04c68f0f 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -865,9 +865,18 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
devices++;
 
if (!devices) {
-   pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
+   pr_warn("EEH: Frozen PHB#%x-PE#%x is empty!\n",
pe->phb->global_number, pe->addr);
-   goto out; /* nothing to recover */
+   /*
+* The device is removed, tear down its state, on powernv
+* hotplug driver would take care of it but not on pseries,
+* permanently disable the card as it is hot removed.
+*
+* In the case of powernv, note that the removal of device
+* is covered by pci rescan lock, so no problem even if hotplug
+* driver attempts to remove the device.
+*/
+   goto recover_failed;
}
 
/* Log the event */
-- 
2.44.0



[PATCH 1/3] powerpc/mce: remove nmi_enter/exit from real mode handler

2020-09-16 Thread Ganesh Goudar
Use of nmi_enter/exit in real mode handler causes the kernel to panic
and reboot on injecting slb multihit on a pseries machine running in hash
mmu mode, as these calls try to access memory outside the RMO region in
real mode handler where translation is disabled.

Add check to not to use these calls on pseries machine running in hash
mmu mode.

Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI 
accounting")
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index ada59f6c4298..1d42fe0f5f9c 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -591,10 +591,15 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 long notrace machine_check_early(struct pt_regs *regs)
 {
long handled = 0;
-   bool nested = in_nmi();
+   bool nested;
+   bool is_pseries_hpt_guest;
u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
 
this_cpu_set_ftrace_enabled(0);
+   is_pseries_hpt_guest = machine_is(pseries) &&
+  mmu_has_feature(MMU_FTR_HPTE_TABLE);
+   /* Do not use nmi_enter/exit for pseries hpte guest */
+   nested = is_pseries_hpt_guest ? true : in_nmi();
 
if (!nested)
nmi_enter();
-- 
2.26.2



[PATCH 3/3] selftest/powerpc: Add slb multihit selftest

2020-09-16 Thread Ganesh Goudar
Add selftest to check if the system recovers from slb multihit
errors.

Signed-off-by: Ganesh Goudar 
---
 tools/testing/selftests/powerpc/Makefile | 3 ++-
 tools/testing/selftests/powerpc/mces/Makefile| 6 ++
 tools/testing/selftests/powerpc/mces/slb_multihit.sh | 9 +
 3 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/mces/Makefile
 create mode 100755 tools/testing/selftests/powerpc/mces/slb_multihit.sh

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 0830e63818c1..3c900b30da79 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -31,7 +31,8 @@ SUB_DIRS = alignment  \
   vphn \
   math \
   ptrace   \
-  security
+  security \
+  mces
 
 endif
 
diff --git a/tools/testing/selftests/powerpc/mces/Makefile 
b/tools/testing/selftests/powerpc/mces/Makefile
new file mode 100644
index ..5a356295e952
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mces/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for machine check exceptions selftests
+
+TEST_PROGS := slb_multihit.sh
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/mces/slb_multihit.sh 
b/tools/testing/selftests/powerpc/mces/slb_multihit.sh
new file mode 100755
index ..35c17c619d0a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mces/slb_multihit.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+if [ ! -e "/sys/kernel/debug/powerpc/mce_error_inject/inject_slb_multihit" ] ; 
then
+exit 0;
+fi
+
+echo 1 > /sys/kernel/debug/powerpc/mce_error_inject/inject_slb_multihit
+exit 0
-- 
2.26.2



[PATCH 0/3] powerpc/mce: Fix mce handler and add selftest

2020-09-16 Thread Ganesh Goudar
This patch series fixes MCE handling for pseries, provides a debugfs
interface for MCE injection and adds a selftest to test MCE handling
on pseries/powernv machines running in hash MMU mode.
The debugfs interface and selftest are added only for SLB multihit
injection; we can add other tests in the future if possible.

Ganesh Goudar (3):
  powerpc/mce: remove nmi_enter/exit from real mode handler
  powerpc/mce: Add debugfs interface to inject MCE
  selftest/powerpc: Add slb multihit selftest

 arch/powerpc/Kconfig.debug|   9 ++
 arch/powerpc/kernel/mce.c |   7 +-
 arch/powerpc/sysdev/Makefile  |   2 +
 arch/powerpc/sysdev/mce_error_inject.c| 149 ++
 tools/testing/selftests/powerpc/Makefile  |   3 +-
 tools/testing/selftests/powerpc/mces/Makefile |   6 +
 .../selftests/powerpc/mces/slb_multihit.sh|   9 ++
 7 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/sysdev/mce_error_inject.c
 create mode 100644 tools/testing/selftests/powerpc/mces/Makefile
 create mode 100755 tools/testing/selftests/powerpc/mces/slb_multihit.sh

-- 
2.26.2



[PATCH 2/3] powerpc/mce: Add debugfs interface to inject MCE

2020-09-16 Thread Ganesh Goudar
To test machine check handling, add debugfs interface to inject
slb multihit errors.

To inject slb multihit:
 #echo 1 > /sys/kernel/debug/powerpc/mce_error_inject/inject_slb_multihit

Signed-off-by: Ganesh Goudar 
Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/Kconfig.debug |   9 ++
 arch/powerpc/sysdev/Makefile   |   2 +
 arch/powerpc/sysdev/mce_error_inject.c | 148 +
 3 files changed, 159 insertions(+)
 create mode 100644 arch/powerpc/sysdev/mce_error_inject.c

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index b88900f4832f..61db133f2f0d 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -398,3 +398,12 @@ config KASAN_SHADOW_OFFSET
hex
depends on KASAN
default 0xe000
+
+config MCE_ERROR_INJECT
+   bool "Enable MCE error injection through debugfs"
+   depends on DEBUG_FS
+   default y
+   help
+ This option creates an mce_error_inject directory in the
+ powerpc debugfs directory that allows limited injection of
+ Machine Check Errors (MCEs).
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index 026b3f01a991..7fc10b77 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -52,3 +52,5 @@ obj-$(CONFIG_PPC_XICS)+= xics/
 obj-$(CONFIG_PPC_XIVE) += xive/
 
 obj-$(CONFIG_GE_FPGA)  += ge/
+
+obj-$(CONFIG_MCE_ERROR_INJECT) += mce_error_inject.o
diff --git a/arch/powerpc/sysdev/mce_error_inject.c 
b/arch/powerpc/sysdev/mce_error_inject.c
new file mode 100644
index ..ca4726bfa2d9
--- /dev/null
+++ b/arch/powerpc/sysdev/mce_error_inject.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Machine Check Exception injection code
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static inline unsigned long get_slb_index(void)
+{
+   unsigned long index;
+
+   index = get_paca()->stab_rr;
+
+   /*
+* simple round-robin replacement of slb starting at SLB_NUM_BOLTED.
+*/
+   if (index < (mmu_slb_size - 1))
+   index++;
+   else
+   index = SLB_NUM_BOLTED;
+   get_paca()->stab_rr = index;
+   return index;
+}
+
+#define slb_esid_mask(ssize)   \
+   (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T)
+
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+unsigned long slot)
+{
+   return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
+}
+
+#define slb_vsid_shift(ssize)  \
+   ((ssize) == MMU_SEGSIZE_256M ? SLB_VSID_SHIFT : SLB_VSID_SHIFT_1T)
+
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+unsigned long flags)
+{
+   return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags |
+   ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+static void insert_slb_entry(char *p, int ssize)
+{
+   unsigned long flags, entry;
+   struct paca_struct *paca;
+
+   flags = SLB_VSID_KERNEL | mmu_psize_defs[MMU_PAGE_64K].sllp;
+
+   preempt_disable();
+
+   paca = get_paca();
+
+   entry = get_slb_index();
+   asm volatile("slbmte %0,%1" :
+   : "r" (mk_vsid_data((unsigned long)p, ssize, flags)),
+ "r" (mk_esid_data((unsigned long)p, ssize, entry))
+   : "memory");
+
+   entry = get_slb_index();
+   asm volatile("slbmte %0,%1" :
+   : "r" (mk_vsid_data((unsigned long)p, ssize, flags)),
+ "r" (mk_esid_data((unsigned long)p, ssize, entry))
+   : "memory");
+   preempt_enable();
+   p[0] = '!';
+}
+
+static void inject_vmalloc_slb_multihit(void)
+{
+   char *p;
+
+   p = vmalloc(2048);
+   if (!p)
+   return;
+
+   insert_slb_entry(p, MMU_SEGSIZE_1T);
+   vfree(p);
+}
+
+static void inject_kmalloc_slb_multihit(void)
+{
+   char *p;
+
+   p = kmalloc(2048, GFP_KERNEL);
+   if (!p)
+   return;
+
+   insert_slb_entry(p, MMU_SEGSIZE_1T);
+   kfree(p);
+}
+
+static ssize_t inject_slb_multihit(const char __user *u_buf, size_t count)
+{
+   char buf[32];
+   size_t buf_size;
+
+   buf_size = min(count, (sizeof(buf) - 1));
+   if (copy_from_user(buf, u_buf, buf_size))
+   return -EFAULT;
+   buf[buf_size] = '\0';
+
+   if (buf[0] != '1')
+   return -EINVAL;
+
+   inject_vmalloc_slb_multihit();
+   inject_kmalloc_slb_multihit();
+   return count;
+}
+
+static ssize_t inject_write(struct file *file, const char __user *buf,
+   size_t count, loff_t *ppos)
+{
+

[PATCH v2 3/3] selftests/lkdtm: Enable selftest for SLB multihit

2020-09-25 Thread Ganesh Goudar
Add PPC_SLB_MULTIHIT to lkdtm selftest framework.

Signed-off-by: Ganesh Goudar 
---
 tools/testing/selftests/lkdtm/tests.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/lkdtm/tests.txt 
b/tools/testing/selftests/lkdtm/tests.txt
index 9d266e79c6a2..7eb3cf91c89e 100644
--- a/tools/testing/selftests/lkdtm/tests.txt
+++ b/tools/testing/selftests/lkdtm/tests.txt
@@ -70,3 +70,4 @@ USERCOPY_KERNEL
 USERCOPY_KERNEL_DS
 STACKLEAK_ERASING OK: the rest of the thread stack is properly erased
 CFI_FORWARD_PROTO
+PPC_SLB_MULTIHIT Recovered
-- 
2.26.2



[PATCH v2 2/3] lkdtm/powerpc: Add SLB multihit test

2020-09-25 Thread Ganesh Goudar
Add support to inject slb multihit errors, to test machine
check handling.

Based on work by Mahesh Salgaonkar and Michal Suchánek.

Cc: Mahesh Salgaonkar 
Cc: Michal Suchánek 
Signed-off-by: Ganesh Goudar 
---
 drivers/misc/lkdtm/Makefile  |   4 ++
 drivers/misc/lkdtm/core.c|   3 +
 drivers/misc/lkdtm/lkdtm.h   |   3 +
 drivers/misc/lkdtm/powerpc.c | 132 +++
 4 files changed, 142 insertions(+)
 create mode 100644 drivers/misc/lkdtm/powerpc.c

diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile
index c70b3822013f..6a82f407fbcd 100644
--- a/drivers/misc/lkdtm/Makefile
+++ b/drivers/misc/lkdtm/Makefile
@@ -11,6 +11,10 @@ lkdtm-$(CONFIG_LKDTM)+= usercopy.o
 lkdtm-$(CONFIG_LKDTM)  += stackleak.o
 lkdtm-$(CONFIG_LKDTM)  += cfi.o
 
+ifeq ($(CONFIG_PPC64),y)
+lkdtm-$(CONFIG_LKDTM)  += powerpc.o
+endif
+
 KASAN_SANITIZE_stackleak.o := n
 KCOV_INSTRUMENT_rodata.o   := n
 
diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index a5e344df9166..8d5db42baa90 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -178,6 +178,9 @@ static const struct crashtype crashtypes[] = {
 #ifdef CONFIG_X86_32
CRASHTYPE(DOUBLE_FAULT),
 #endif
+#ifdef CONFIG_PPC64
+   CRASHTYPE(PPC_SLB_MULTIHIT),
+#endif
 };
 
 
diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
index 8878538b2c13..b305bd511ee5 100644
--- a/drivers/misc/lkdtm/lkdtm.h
+++ b/drivers/misc/lkdtm/lkdtm.h
@@ -104,4 +104,7 @@ void lkdtm_STACKLEAK_ERASING(void);
 /* cfi.c */
 void lkdtm_CFI_FORWARD_PROTO(void);
 
+/* powerpc.c */
+void lkdtm_PPC_SLB_MULTIHIT(void);
+
 #endif
diff --git a/drivers/misc/lkdtm/powerpc.c b/drivers/misc/lkdtm/powerpc.c
new file mode 100644
index ..d6db18444757
--- /dev/null
+++ b/drivers/misc/lkdtm/powerpc.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+
+static inline unsigned long get_slb_index(void)
+{
+   unsigned long index;
+
+   index = get_paca()->stab_rr;
+
+   /*
+* simple round-robin replacement of slb starting at SLB_NUM_BOLTED.
+*/
+   if (index < (mmu_slb_size - 1))
+   index++;
+   else
+   index = SLB_NUM_BOLTED;
+   get_paca()->stab_rr = index;
+   return index;
+}
+
+#define slb_esid_mask(ssize)   \
+   (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T)
+
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+unsigned long slot)
+{
+   return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
+}
+
+#define slb_vsid_shift(ssize)  \
+   ((ssize) == MMU_SEGSIZE_256M ? SLB_VSID_SHIFT : SLB_VSID_SHIFT_1T)
+
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+unsigned long flags)
+{
+   return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags |
+   ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+static void insert_slb_entry(char *p, int ssize)
+{
+   unsigned long flags, entry;
+
+   flags = SLB_VSID_KERNEL | mmu_psize_defs[MMU_PAGE_64K].sllp;
+   preempt_disable();
+
+   entry = get_slb_index();
+   asm volatile("slbmte %0,%1" :
+   : "r" (mk_vsid_data((unsigned long)p, ssize, flags)),
+ "r" (mk_esid_data((unsigned long)p, ssize, entry))
+   : "memory");
+
+   entry = get_slb_index();
+   asm volatile("slbmte %0,%1" :
+   : "r" (mk_vsid_data((unsigned long)p, ssize, flags)),
+ "r" (mk_esid_data((unsigned long)p, ssize, entry))
+   : "memory");
+   preempt_enable();
+   p[0] = '!';
+}
+
+static void inject_vmalloc_slb_multihit(void)
+{
+   char *p;
+
+   p = vmalloc(2048);
+   if (!p)
+   return;
+
+   insert_slb_entry(p, MMU_SEGSIZE_1T);
+   vfree(p);
+}
+
+static void inject_kmalloc_slb_multihit(void)
+{
+   char *p;
+
+   p = kmalloc(2048, GFP_KERNEL);
+   if (!p)
+   return;
+
+   insert_slb_entry(p, MMU_SEGSIZE_1T);
+   kfree(p);
+}
+
+static void insert_dup_slb_entry_0(void)
+{
+   unsigned long test_address = 0xC000;
+   volatile unsigned long *test_ptr;
+   unsigned long entry, i = 0;
+   unsigned long esid, vsid;
+
+   test_ptr = (unsigned long *)test_address;
+   preempt_disable();
+
+   asm volatile("slbmfee  %0,%1" : "=r" (esid) : "r" (i));
+   asm volatile("slbmfev  %0,%1" : "=r" (vsid) : "r" (i));
+   entry = get_slb_index();
+
+   /* for i !=0 we would need to mask out the old entry number */
+   asm volatile("slbmte %

[PATCH v2 1/3] powerpc/mce: remove nmi_enter/exit from real mode handler

2020-09-25 Thread Ganesh Goudar
Use of nmi_enter/exit in the real mode handler causes the kernel to panic
and reboot on injecting an SLB multihit on a pseries machine running in
hash MMU mode, as these calls try to access memory outside the RMO region
from the real mode handler, where translation is disabled.

Add check to not to use these calls on pseries machine running in hash
mmu mode.

Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI 
accounting")
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index ada59f6c4298..3bf39dd5dd43 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -591,12 +591,14 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 long notrace machine_check_early(struct pt_regs *regs)
 {
long handled = 0;
-   bool nested = in_nmi();
+   bool is_pseries_hpt_guest;
u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
 
this_cpu_set_ftrace_enabled(0);
-
-   if (!nested)
+   is_pseries_hpt_guest = machine_is(pseries) &&
+  mmu_has_feature(MMU_FTR_HPTE_TABLE);
+   /* Do not use nmi_enter/exit for pseries hpte guest */
+   if (!is_pseries_hpt_guest)
nmi_enter();
 
hv_nmi_check_nonrecoverable(regs);
@@ -607,7 +609,7 @@ long notrace machine_check_early(struct pt_regs *regs)
if (ppc_md.machine_check_early)
handled = ppc_md.machine_check_early(regs);
 
-   if (!nested)
+   if (!is_pseries_hpt_guest)
nmi_exit();
 
this_cpu_set_ftrace_enabled(ftrace_enabled);
-- 
2.26.2



[PATCH v2 0/3] powerpc/mce: Fix mce handler and add selftest

2020-09-25 Thread Ganesh Goudar
This patch series fixes mce handling for pseries, Adds LKDTM test
for SLB multihit recovery and enables selftest for the same,
basically to test MCE handling on pseries/powernv machines running
in hash mmu mode.

v2:
* Remove in_nmi check before calling nmi_enter/exit,
  as nesting is supported.
* Fix build errors and remove unused variables.
* Integrate error injection code into LKDTM.
* Add support to inject multihit in paca.

Ganesh Goudar (3):
  powerpc/mce: remove nmi_enter/exit from real mode handler
  lkdtm/powerpc: Add SLB multihit test
  selftests/lkdtm: Enable selftest for SLB multihit

 arch/powerpc/kernel/mce.c   |  10 +-
 drivers/misc/lkdtm/Makefile |   4 +
 drivers/misc/lkdtm/core.c   |   3 +
 drivers/misc/lkdtm/lkdtm.h  |   3 +
 drivers/misc/lkdtm/powerpc.c| 132 
 tools/testing/selftests/lkdtm/tests.txt |   1 +
 6 files changed, 149 insertions(+), 4 deletions(-)
 create mode 100644 drivers/misc/lkdtm/powerpc.c

-- 
2.26.2



[PATCH v3 0/2] powerpc/mce: Fix mce handler and add selftest

2020-10-01 Thread Ganesh Goudar
This patch series fixes mce handling for pseries, Adds LKDTM test
for SLB multihit recovery and enables selftest for the same,
basically to test MCE handling on pseries/powernv machines running
in hash mmu mode.

v3:
* Merging selftest changes with patch 2/2, Instead of having separate
  patch.
* Minor improvements like adding enough comments, Makefile changes,
  including header file and adding some prints.

v2:
* Remove in_nmi check before calling nmi_enter/exit,
  as nesting is supported.
* Fix build errors and remove unused variables.
* Integrate error injection code into LKDTM.
* Add support to inject multihit in paca.

Ganesh Goudar (2):
  powerpc/mce: remove nmi_enter/exit from real mode handler
  lkdtm/powerpc: Add SLB multihit test

 arch/powerpc/kernel/mce.c   |  10 +-
 drivers/misc/lkdtm/Makefile |   1 +
 drivers/misc/lkdtm/core.c   |   3 +
 drivers/misc/lkdtm/lkdtm.h  |   3 +
 drivers/misc/lkdtm/powerpc.c| 156 
 tools/testing/selftests/lkdtm/tests.txt |   1 +
 6 files changed, 170 insertions(+), 4 deletions(-)
 create mode 100644 drivers/misc/lkdtm/powerpc.c

-- 
2.26.2



[PATCH v3 1/2] powerpc/mce: remove nmi_enter/exit from real mode handler

2020-10-01 Thread Ganesh Goudar
Use of nmi_enter/exit in the real mode handler causes the kernel to panic
and reboot on injecting an SLB multihit on a pseries machine running in
hash MMU mode, as these calls try to access memory outside the RMO region
from the real mode handler, where translation is disabled.

Add check to not to use these calls on pseries machine running in hash
mmu mode.

Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI 
accounting")
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index ada59f6c4298..3bf39dd5dd43 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -591,12 +591,14 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 long notrace machine_check_early(struct pt_regs *regs)
 {
long handled = 0;
-   bool nested = in_nmi();
+   bool is_pseries_hpt_guest;
u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
 
this_cpu_set_ftrace_enabled(0);
-
-   if (!nested)
+   is_pseries_hpt_guest = machine_is(pseries) &&
+  mmu_has_feature(MMU_FTR_HPTE_TABLE);
+   /* Do not use nmi_enter/exit for pseries hpte guest */
+   if (!is_pseries_hpt_guest)
nmi_enter();
 
hv_nmi_check_nonrecoverable(regs);
@@ -607,7 +609,7 @@ long notrace machine_check_early(struct pt_regs *regs)
if (ppc_md.machine_check_early)
handled = ppc_md.machine_check_early(regs);
 
-   if (!nested)
+   if (!is_pseries_hpt_guest)
nmi_exit();
 
this_cpu_set_ftrace_enabled(ftrace_enabled);
-- 
2.26.2



[PATCH v3 2/2] lkdtm/powerpc: Add SLB multihit test

2020-10-01 Thread Ganesh Goudar
To check machine check handling, add support to inject slb
multihit errors.

Reviewed-by: Michal Suchánek 
Co-developed-by: Mahesh Salgaonkar 
Signed-off-by: Mahesh Salgaonkar 
Signed-off-by: Ganesh Goudar 
---
 drivers/misc/lkdtm/Makefile |   1 +
 drivers/misc/lkdtm/core.c   |   3 +
 drivers/misc/lkdtm/lkdtm.h  |   3 +
 drivers/misc/lkdtm/powerpc.c| 156 
 tools/testing/selftests/lkdtm/tests.txt |   1 +
 5 files changed, 164 insertions(+)
 create mode 100644 drivers/misc/lkdtm/powerpc.c

diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile
index c70b3822013f..f37ecfb0a707 100644
--- a/drivers/misc/lkdtm/Makefile
+++ b/drivers/misc/lkdtm/Makefile
@@ -10,6 +10,7 @@ lkdtm-$(CONFIG_LKDTM) += rodata_objcopy.o
 lkdtm-$(CONFIG_LKDTM)  += usercopy.o
 lkdtm-$(CONFIG_LKDTM)  += stackleak.o
 lkdtm-$(CONFIG_LKDTM)  += cfi.o
+lkdtm-$(CONFIG_PPC64)  += powerpc.o
 
 KASAN_SANITIZE_stackleak.o := n
 KCOV_INSTRUMENT_rodata.o   := n
diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index a5e344df9166..8d5db42baa90 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -178,6 +178,9 @@ static const struct crashtype crashtypes[] = {
 #ifdef CONFIG_X86_32
CRASHTYPE(DOUBLE_FAULT),
 #endif
+#ifdef CONFIG_PPC64
+   CRASHTYPE(PPC_SLB_MULTIHIT),
+#endif
 };
 
 
diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
index 8878538b2c13..b305bd511ee5 100644
--- a/drivers/misc/lkdtm/lkdtm.h
+++ b/drivers/misc/lkdtm/lkdtm.h
@@ -104,4 +104,7 @@ void lkdtm_STACKLEAK_ERASING(void);
 /* cfi.c */
 void lkdtm_CFI_FORWARD_PROTO(void);
 
+/* powerpc.c */
+void lkdtm_PPC_SLB_MULTIHIT(void);
+
 #endif
diff --git a/drivers/misc/lkdtm/powerpc.c b/drivers/misc/lkdtm/powerpc.c
new file mode 100644
index ..033111f7d929
--- /dev/null
+++ b/drivers/misc/lkdtm/powerpc.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include "lkdtm.h"
+
+/* Gets index for new slb entry */
+static inline unsigned long get_slb_index(void)
+{
+   unsigned long index;
+
+   index = get_paca()->stab_rr;
+
+   /*
+* simple round-robin replacement of slb starting at SLB_NUM_BOLTED.
+*/
+   if (index < (mmu_slb_size - 1))
+   index++;
+   else
+   index = SLB_NUM_BOLTED;
+   get_paca()->stab_rr = index;
+   return index;
+}
+
+#define slb_esid_mask(ssize)   \
+   (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T)
+
+/* Form the operand for slbmte */
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+unsigned long slot)
+{
+   return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
+}
+
+#define slb_vsid_shift(ssize)  \
+   ((ssize) == MMU_SEGSIZE_256M ? SLB_VSID_SHIFT : SLB_VSID_SHIFT_1T)
+
+/* Form the operand for slbmte */
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+unsigned long flags)
+{
+   return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags |
+   ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+/* Inserts new slb entry */
+static void insert_slb_entry(char *p, int ssize)
+{
+   unsigned long flags, entry;
+
+   flags = SLB_VSID_KERNEL | mmu_psize_defs[MMU_PAGE_64K].sllp;
+   preempt_disable();
+
+   entry = get_slb_index();
+   asm volatile("slbmte %0,%1" :
+   : "r" (mk_vsid_data((unsigned long)p, ssize, flags)),
+ "r" (mk_esid_data((unsigned long)p, ssize, entry))
+   : "memory");
+
+   entry = get_slb_index();
+   asm volatile("slbmte %0,%1" :
+   : "r" (mk_vsid_data((unsigned long)p, ssize, flags)),
+ "r" (mk_esid_data((unsigned long)p, ssize, entry))
+   : "memory");
+   preempt_enable();
+   /*
+* This triggers exception, If handled correctly we must recover
+* from this error.
+*/
+   p[0] = '!';
+}
+
+/* Inject slb multihit on vmalloc-ed address i.e 0xD00... */
+static void inject_vmalloc_slb_multihit(void)
+{
+   char *p;
+
+   p = vmalloc(2048);
+   if (!p)
+   return;
+
+   insert_slb_entry(p, MMU_SEGSIZE_1T);
+   vfree(p);
+}
+
+/* Inject slb multihit on kmalloc-ed address i.e 0xC00... */
+static void inject_kmalloc_slb_multihit(void)
+{
+   char *p;
+
+   p = kmalloc(2048, GFP_KERNEL);
+   if (!p)
+   return;
+
+   insert_slb_entry(p, MMU_SEGSIZE_1T);
+   kfree(p);
+}
+
+/*
+ * Few initial SLB entries are bolted. Add a test to inject
+ * multihit in bolted entry 0.
+ */
+static void inse

[PATCH v4 0/2] powerpc/mce: Fix mce handler and add selftest

2020-10-08 Thread Ganesh Goudar
This patch series fixes mce handling for pseries, Adds LKDTM test
for SLB multihit recovery and enables selftest for the same,
basically to test MCE handling on pseries/powernv machines running
in hash mmu mode.

v4:
* Use radix_enabled() to check if its in Hash or Radix mode.
* Use FW_FEATURE_LPAR instead of machine_is_pseries().

v3:
* Merging selftest changes with patch 2/2, Instead of having separate
  patch.
* Minor improvements like adding enough comments, Makefile changes,
  including header file and adding some prints.

v2:
* Remove in_nmi check before calling nmi_enter/exit,
  as nesting is supported.
* Fix build errors and remove unused variables.
* Integrate error injection code into LKDTM.
* Add support to inject multihit in paca.


Ganesh Goudar (2):
  powerpc/mce: remove nmi_enter/exit from real mode handler
  lkdtm/powerpc: Add SLB multihit test

 arch/powerpc/kernel/mce.c   |   7 +-
 drivers/misc/lkdtm/Makefile |   1 +
 drivers/misc/lkdtm/core.c   |   3 +
 drivers/misc/lkdtm/lkdtm.h  |   3 +
 drivers/misc/lkdtm/powerpc.c| 156 
 tools/testing/selftests/lkdtm/tests.txt |   1 +
 6 files changed, 167 insertions(+), 4 deletions(-)
 create mode 100644 drivers/misc/lkdtm/powerpc.c

-- 
2.26.2



[PATCH v4 1/2] powerpc/mce: remove nmi_enter/exit from real mode handler

2020-10-08 Thread Ganesh Goudar
Use of nmi_enter/exit in the real mode handler causes the kernel to panic
and reboot on injecting an SLB multihit on a pseries machine running in
hash MMU mode, as these calls try to access memory outside the RMO region
from the real mode handler, where translation is disabled.

Add check to not to use these calls on pseries machine running in hash
mmu mode.

Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI 
accounting")
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index ada59f6c4298..63702c0badb9 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -591,12 +591,11 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 long notrace machine_check_early(struct pt_regs *regs)
 {
long handled = 0;
-   bool nested = in_nmi();
u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
 
this_cpu_set_ftrace_enabled(0);
-
-   if (!nested)
+   /* Do not use nmi_enter/exit for pseries hpte guest */
+   if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR))
nmi_enter();
 
hv_nmi_check_nonrecoverable(regs);
@@ -607,7 +606,7 @@ long notrace machine_check_early(struct pt_regs *regs)
if (ppc_md.machine_check_early)
handled = ppc_md.machine_check_early(regs);
 
-   if (!nested)
+   if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR))
nmi_exit();
 
this_cpu_set_ftrace_enabled(ftrace_enabled);
-- 
2.26.2



[PATCH v4 2/2] lkdtm/powerpc: Add SLB multihit test

2020-10-08 Thread Ganesh Goudar
To check machine check handling, add support to inject slb
multihit errors.

Cc: Kees Cook 
Reviewed-by: Michal Suchánek 
Co-developed-by: Mahesh Salgaonkar 
Signed-off-by: Mahesh Salgaonkar 
Signed-off-by: Ganesh Goudar 
---
 drivers/misc/lkdtm/Makefile |   1 +
 drivers/misc/lkdtm/core.c   |   3 +
 drivers/misc/lkdtm/lkdtm.h  |   3 +
 drivers/misc/lkdtm/powerpc.c| 156 
 tools/testing/selftests/lkdtm/tests.txt |   1 +
 5 files changed, 164 insertions(+)
 create mode 100644 drivers/misc/lkdtm/powerpc.c

diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile
index c70b3822013f..f37ecfb0a707 100644
--- a/drivers/misc/lkdtm/Makefile
+++ b/drivers/misc/lkdtm/Makefile
@@ -10,6 +10,7 @@ lkdtm-$(CONFIG_LKDTM) += rodata_objcopy.o
 lkdtm-$(CONFIG_LKDTM)  += usercopy.o
 lkdtm-$(CONFIG_LKDTM)  += stackleak.o
 lkdtm-$(CONFIG_LKDTM)  += cfi.o
+lkdtm-$(CONFIG_PPC64)  += powerpc.o
 
 KASAN_SANITIZE_stackleak.o := n
 KCOV_INSTRUMENT_rodata.o   := n
diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index a5e344df9166..8d5db42baa90 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -178,6 +178,9 @@ static const struct crashtype crashtypes[] = {
 #ifdef CONFIG_X86_32
CRASHTYPE(DOUBLE_FAULT),
 #endif
+#ifdef CONFIG_PPC64
+   CRASHTYPE(PPC_SLB_MULTIHIT),
+#endif
 };
 
 
diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
index 8878538b2c13..b305bd511ee5 100644
--- a/drivers/misc/lkdtm/lkdtm.h
+++ b/drivers/misc/lkdtm/lkdtm.h
@@ -104,4 +104,7 @@ void lkdtm_STACKLEAK_ERASING(void);
 /* cfi.c */
 void lkdtm_CFI_FORWARD_PROTO(void);
 
+/* powerpc.c */
+void lkdtm_PPC_SLB_MULTIHIT(void);
+
 #endif
diff --git a/drivers/misc/lkdtm/powerpc.c b/drivers/misc/lkdtm/powerpc.c
new file mode 100644
index ..f388b53dccba
--- /dev/null
+++ b/drivers/misc/lkdtm/powerpc.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "lkdtm.h"
+#include 
+#include 
+
+/* Gets index for new slb entry */
+static inline unsigned long get_slb_index(void)
+{
+   unsigned long index;
+
+   index = get_paca()->stab_rr;
+
+   /*
+* simple round-robin replacement of slb starting at SLB_NUM_BOLTED.
+*/
+   if (index < (mmu_slb_size - 1))
+   index++;
+   else
+   index = SLB_NUM_BOLTED;
+   get_paca()->stab_rr = index;
+   return index;
+}
+
+#define slb_esid_mask(ssize)   \
+   (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T)
+
+/* Form the operand for slbmte */
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+unsigned long slot)
+{
+   return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
+}
+
+#define slb_vsid_shift(ssize)  \
+   ((ssize) == MMU_SEGSIZE_256M ? SLB_VSID_SHIFT : SLB_VSID_SHIFT_1T)
+
+/* Form the operand for slbmte */
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+unsigned long flags)
+{
+   return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags |
+   ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+/* Inserts new slb entry */
+static void insert_slb_entry(char *p, int ssize)
+{
+   unsigned long flags, entry;
+
+   flags = SLB_VSID_KERNEL | mmu_psize_defs[MMU_PAGE_64K].sllp;
+   preempt_disable();
+
+   entry = get_slb_index();
+   asm volatile("slbmte %0,%1" :
+   : "r" (mk_vsid_data((unsigned long)p, ssize, flags)),
+ "r" (mk_esid_data((unsigned long)p, ssize, entry))
+   : "memory");
+
+   entry = get_slb_index();
+   asm volatile("slbmte %0,%1" :
+   : "r" (mk_vsid_data((unsigned long)p, ssize, flags)),
+ "r" (mk_esid_data((unsigned long)p, ssize, entry))
+   : "memory");
+   preempt_enable();
+   /*
+* This triggers exception, If handled correctly we must recover
+* from this error.
+*/
+   p[0] = '!';
+}
+
+/* Inject slb multihit on vmalloc-ed address i.e 0xD00... */
+static void inject_vmalloc_slb_multihit(void)
+{
+   char *p;
+
+   p = vmalloc(2048);
+   if (!p)
+   return;
+
+   insert_slb_entry(p, MMU_SEGSIZE_1T);
+   vfree(p);
+}
+
+/* Inject slb multihit on kmalloc-ed address i.e 0xC00... */
+static void inject_kmalloc_slb_multihit(void)
+{
+   char *p;
+
+   p = kmalloc(2048, GFP_KERNEL);
+   if (!p)
+   return;
+
+   insert_slb_entry(p, MMU_SEGSIZE_1T);
+   kfree(p);
+}
+
+/*
+ * Few initial SLB entries are bolted. Add a test to inject
+ * multihit in bolted entry 0.
+ */
+static

[PATCH] powerpc/mce: log the error for all unrecoverable errors

2022-11-13 Thread Ganesh Goudar
machine_check_log_err() is not getting called for all
unrecoverable errors, so we fail to log those errors.

Raise irq work in save_mce_event() for unrecoverable errors,
so that the error is logged from the MCE event handling block
in the timer handler.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 6c5d30fba766..a1cb2172eb7b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (mce->error_type == MCE_ERROR_TYPE_UE)
mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
+   /*
+* Raise irq work, So that we don't miss to log the error for
+* unrecoverable errors.
+*/
+   if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED)
+   mce_irq_work_queue();
+
if (!addr)
return;
 
@@ -235,7 +242,6 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
   evt, sizeof(*evt));
 
/* Queue work to process this event later. */
-   mce_irq_work_queue();
 }
 
 /*
-- 
2.37.1



[PATCH] powerpc/mce: save ignore_event flag unconditionally for UE

2021-04-06 Thread Ganesh Goudar
When we hit a UE while using the machine check safe copy routines,
the ignore_event flag is set and the event is ignored by the MCE
handler. The flag is also saved for deferred handling and printing
of MCE event information. However, as of now the flag is only saved
after checking whether the effective address is provided or the
physical address is calculated, which is not right.

Save the ignore_event flag regardless of whether the effective
address is provided or the physical address is calculated.

Without this change following log is seen, when the event is to be
ignored.

[  512.971365] MCE: CPU1: machine check (Severe)  UE Load/Store [Recovered]
[  512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90
[  512.971655] MCE: CPU1: Initiator CPU
[  512.971739] MCE: CPU1: Unknown
[  512.972209] MCE: CPU1: machine check (Severe)  UE Load/Store [Recovered]
[  512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90
[  512.972456] MCE: CPU1: Initiator CPU
[  512.972534] MCE: CPU1: Unknown

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 11f0cae086ed..db9363e131ce 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -131,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
 * Populate the mce error_type and type-specific error_type.
 */
mce_set_error_info(mce, mce_err);
+   if (mce->error_type == MCE_ERROR_TYPE_UE)
+   mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
if (!addr)
return;
@@ -159,7 +161,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   mce->u.ue_error.ignore_event = mce_err->ignore_event;
machine_check_ue_event(mce);
}
}
-- 
2.26.2



[PATCH] powerpc/pseries/mce: Fix a typo in error type assignment

2021-04-16 Thread Ganesh Goudar
For the MC_ERROR_TYPE_I_CACHE case the error type should be
MCE_ERROR_TYPE_ICACHE, not MCE_ERROR_TYPE_DCACHE.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/platforms/pseries/ras.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index f8b390a9d9fb..9d4ef65da7f3 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -699,7 +699,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
break;
case MC_ERROR_TYPE_I_CACHE:
-   mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
+   mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
break;
case MC_ERROR_TYPE_UNKNOWN:
default:
-- 
2.26.2



[PATCH v2] powerpc/pseries: Avoid using addr_to_pfn in realmode

2020-07-09 Thread Ganesh Goudar
When a UE or memory error exception is encountered, the MCE handler
tries to find the pfn using addr_to_pfn(), which takes an effective
address as an argument; the pfn is later used to poison the page
where the memory error occurred. Recent rework in this area made
addr_to_pfn() run in real mode, which can be fatal as it may try
to access memory outside the RMO region.

To fix this, call addr_to_pfn() after switching to virtual mode.

Signed-off-by: Ganesh Goudar 
---
V2: Leave bare metal code and save_mce_event as is.
---
 arch/powerpc/platforms/pseries/ras.c | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index f3736fcd98fc..def875815e92 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -610,16 +610,8 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED)
eaddr = be64_to_cpu(mce_log->effective_address);
 
-   if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
+   if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED)
paddr = be64_to_cpu(mce_log->logical_address);
-   } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
-   unsigned long pfn;
-
-   pfn = addr_to_pfn(regs, eaddr);
-   if (pfn != ULONG_MAX)
-   paddr = pfn << PAGE_SHIFT;
-   }
-
break;
case MC_ERROR_TYPE_SLB:
mce_err.error_type = MCE_ERROR_TYPE_SLB;
@@ -725,6 +717,16 @@ static int mce_handle_error(struct pt_regs *regs, struct 
rtas_error_log *errp)
 *   SLB multihit is done by now.
 */
mtmsr(mfmsr() | MSR_IR | MSR_DR);
+
+   /* Use addr_to_pfn after switching to virtual mode */
+   if (!paddr && error_type == MC_ERROR_TYPE_UE &&
+   mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
+   unsigned long pfn;
+
+   pfn = addr_to_pfn(regs, eaddr);
+   if (pfn != ULONG_MAX)
+   paddr = pfn << PAGE_SHIFT;
+   }
save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
&mce_err, regs->nip, eaddr, paddr);
 
-- 
2.17.2



[PATCH v3] powerpc/pseries: Avoid using addr_to_pfn in realmode

2020-07-20 Thread Ganesh Goudar
When a UE or memory error exception is encountered, the MCE handler
tries to find the pfn using addr_to_pfn(), which takes an effective
address as an argument; the pfn is later used to poison the page where
the memory error occurred. Recent rework in this area made addr_to_pfn
run in realmode, which can be fatal as it may try to access
memory outside the RMO region.

To fix this, have separate functions for realmode and virtual mode
handling and let addr_to_pfn run in virtual mode.

Without this fix following kernel crash is seen on hitting UE.

[  485.128036] Oops: Kernel access of bad area, sig: 11 [#1]
[  485.128040] LE SMP NR_CPUS=2048 NUMA pSeries
[  485.128047] Modules linked in:
[  485.128067] CPU: 15 PID: 6536 Comm: insmod Kdump: loaded Tainted: G OE 5.7.0 
#22
[  485.128074] NIP:  c009b24c LR: c00398d8 CTR: c0cd57c0
[  485.128078] REGS: c3f1f970 TRAP: 0300   Tainted: G OE (5.7.0)
[  485.128082] MSR:  80001003   CR: 28008284  XER: 0001
[  485.128088] CFAR: c009b190 DAR: c001fab0 DSISR: 4000 
IRQMASK: 1
[  485.128088] GPR00: 0001 c3f1fbf0 c1634300 
b0fa0100
[  485.128088] GPR04: d222  fab0 
0022
[  485.128088] GPR08: c001fab0  c001fab0 
c3f1fc14
[  485.128088] GPR12: 0008 c3ff5880 d218 

[  485.128088] GPR16: ff20 fff1 fff2 
d21a1100
[  485.128088] GPR20: d220 c0015c893c50 c0d49b28 
c0015c893c50
[  485.128088] GPR24: d21a0d08 c14e5da8 d21a0818 
000a
[  485.128088] GPR28: 0008 000a c17e2970 
000a
[  485.128125] NIP [c009b24c] __find_linux_pte+0x11c/0x310
[  485.128130] LR [c00398d8] addr_to_pfn+0x138/0x170
[  485.128133] Call Trace:
[  485.128135] Instruction dump:
[  485.128138] 3929 7d4a3378 7c883c36 7d2907b4 794a1564 7d294038 794af082 
3900
[  485.128144] 79291f24 790af00e 78e70020 7d095214 <7c69502a> 2fa3 419e011c 
70690040
[  485.128152] ---[ end trace d34b27e29ae0e340 ]---

Signed-off-by: Ganesh Goudar 
---
V2: Leave bare metal code and save_mce_event as is.

V3: Have separate functions for realmode and virtual mode handling.
---
 arch/powerpc/platforms/pseries/ras.c | 119 ---
 1 file changed, 70 insertions(+), 49 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index f3736fcd98fc..32fe3fad86b8 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -522,18 +522,55 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
 }
 
+static int mce_handle_err_realmode(int disposition, u8 error_type)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (disposition == RTAS_DISP_NOT_RECOVERED) {
+   switch (error_type) {
+   case MC_ERROR_TYPE_SLB:
+   case MC_ERROR_TYPE_ERAT:
+   /*
+* Store the old slb content in paca before flushing.
+* Print this when we go to virtual mode.
+* There are chances that we may hit MCE again if there
+* is a parity error on the SLB entry we trying to read
+* for saving. Hence limit the slb saving to single
+* level of recursion.
+*/
+   if (local_paca->in_mce == 1)
+   slb_save_contents(local_paca->mce_faulty_slbs);
+   flush_and_reload_slb();
+   disposition = RTAS_DISP_FULLY_RECOVERED;
+   break;
+   default:
+   break;
+   }
+   } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+   /* Platform corrected itself but could be degraded */
+   pr_err("MCE: limited recovery, system may be degraded\n");
+   disposition = RTAS_DISP_FULLY_RECOVERED;
+   }
+#endif
+   return disposition;
+}
 
-static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
+static int mce_handle_err_virtmode(struct pt_regs *regs,
+  struct rtas_error_log *errp,
+  struct pseries_mc_errorlog *mce_log,
+  int disposition)
 {
struct mce_error_info mce_err = { 0 };
-   unsigned long eaddr = 0, paddr = 0;
-   struct pseries_errorlog *pseries_log;
-   struct pseries_mc_errorlog *mce_log;
-   int disposition = rtas_error_disposition(errp);
int initiator = rtas_error_initiator(errp);
int severity = rtas_error_severity(errp);
+   unsigned lon

[PATCH v4] powerpc/pseries: Avoid using addr_to_pfn in real mode

2020-07-23 Thread Ganesh Goudar
When a UE or memory error exception is encountered, the MCE handler
tries to find the pfn using addr_to_pfn(), which takes an effective
address as an argument; the pfn is later used to poison the page where
the memory error occurred. Recent rework in this area made addr_to_pfn
run in real mode, which can be fatal as it may try to access
memory outside the RMO region.

Have two helper functions to separate things to be done in real mode
and virtual mode without changing any functionality. This also fixes
the following error as the use of addr_to_pfn is now moved to virtual
mode.

Without this change following kernel crash is seen on hitting UE.

[  485.128036] Oops: Kernel access of bad area, sig: 11 [#1]
[  485.128040] LE SMP NR_CPUS=2048 NUMA pSeries
[  485.128047] Modules linked in:
[  485.128067] CPU: 15 PID: 6536 Comm: insmod Kdump: loaded Tainted: G OE 5.7.0 
#22
[  485.128074] NIP:  c009b24c LR: c00398d8 CTR: c0cd57c0
[  485.128078] REGS: c3f1f970 TRAP: 0300   Tainted: G OE (5.7.0)
[  485.128082] MSR:  80001003   CR: 28008284  XER: 0001
[  485.128088] CFAR: c009b190 DAR: c001fab0 DSISR: 4000 
IRQMASK: 1
[  485.128088] GPR00: 0001 c3f1fbf0 c1634300 
b0fa0100
[  485.128088] GPR04: d222  fab0 
0022
[  485.128088] GPR08: c001fab0  c001fab0 
c3f1fc14
[  485.128088] GPR12: 0008 c3ff5880 d218 

[  485.128088] GPR16: ff20 fff1 fff2 
d21a1100
[  485.128088] GPR20: d220 c0015c893c50 c0d49b28 
c0015c893c50
[  485.128088] GPR24: d21a0d08 c14e5da8 d21a0818 
000a
[  485.128088] GPR28: 0008 000a c17e2970 
000a
[  485.128125] NIP [c009b24c] __find_linux_pte+0x11c/0x310
[  485.128130] LR [c00398d8] addr_to_pfn+0x138/0x170
[  485.128133] Call Trace:
[  485.128135] Instruction dump:
[  485.128138] 3929 7d4a3378 7c883c36 7d2907b4 794a1564 7d294038 794af082 
3900
[  485.128144] 79291f24 790af00e 78e70020 7d095214 <7c69502a> 2fa3 419e011c 
70690040
[  485.128152] ---[ end trace d34b27e29ae0e340 ]---

Fixes: 9ca766f9891d ("powerpc/64s/pseries: machine check convert to use common 
event code")
Signed-off-by: Ganesh Goudar 
---
V2: Leave bare metal code and save_mce_event as is.

V3: Have separate functions for realmode and virtual mode handling.

V4: Fix build warning, rephrase commit message.
---
 arch/powerpc/platforms/pseries/ras.c | 118 ---
 1 file changed, 69 insertions(+), 49 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index f3736fcd98fc..c509e43bac23 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -522,18 +522,55 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
 }
 
+static int mce_handle_err_realmode(int disposition, u8 error_type)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (disposition == RTAS_DISP_NOT_RECOVERED) {
+   switch (error_type) {
+   case MC_ERROR_TYPE_SLB:
+   case MC_ERROR_TYPE_ERAT:
+   /*
+* Store the old slb content in paca before flushing.
+* Print this when we go to virtual mode.
+* There are chances that we may hit MCE again if there
+* is a parity error on the SLB entry we trying to read
+* for saving. Hence limit the slb saving to single
+* level of recursion.
+*/
+   if (local_paca->in_mce == 1)
+   slb_save_contents(local_paca->mce_faulty_slbs);
+   flush_and_reload_slb();
+   disposition = RTAS_DISP_FULLY_RECOVERED;
+   break;
+   default:
+   break;
+   }
+   } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+   /* Platform corrected itself but could be degraded */
+   pr_err("MCE: limited recovery, system may be degraded\n");
+   disposition = RTAS_DISP_FULLY_RECOVERED;
+   }
+#endif
+   return disposition;
+}
 
-static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
+static int mce_handle_err_virtmode(struct pt_regs *regs,
+  struct rtas_error_log *errp,
+  struct pseries_mc_errorlog *mce_log,
+  int disposition)
 {
struct mce_error_info mce_err = { 0 };
-   unsigned long eaddr = 0, paddr = 0;
-   stru

[PATCH] powerpc/mce: Fix access error in mce handler

2021-09-06 Thread Ganesh Goudar
We queue an irq work for deferred processing of the mce event
in the realmode mce handler, where translation is disabled.
Queuing of the work may result in accessing memory outside the
RMO region; such access needs translation to be enabled
for an LPAR running with hash MMU, else the kernel crashes.

So enable the translation before queuing the work.

Without this change following trace is seen on injecting machine
check error in an LPAR running with hash mmu.

Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137
NIP:  c0735d60 LR: c0318640 CTR: 
REGS: c0001ebff9a0 TRAP: 0300   Tainted: G   OE  (5.14.0-mce+)
MSR:  80001003   CR: 28008228  XER: 0001
CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0
GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8
GPR04: c16337e8 c0027fa8fe08 0023 c16337f0
GPR08: 0023 c12ffe08  c00801460240
GPR12:  c0001ec9a900 c0002ac4bd00 
GPR16: 05a0 c008006b c008006b05a0 c0ff3068
GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298
GPR24: c00801490108 c1636198 c00801470090 c00801470058
GPR28: 0510 c0080100 c0080819 0019
NIP [c0735d60] llist_add_batch+0x0/0x40
LR [c0318640] __irq_work_queue_local+0x70/0xc0
Call Trace:
[c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable)
[c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70
[c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0
[c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4

Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from 
handler")
Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 47a683cd00d2..9d1e39d42e3e 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -249,6 +249,7 @@ void machine_check_queue_event(void)
 {
int index;
struct machine_check_event evt;
+   unsigned long msr;
 
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
@@ -262,8 +263,19 @@ void machine_check_queue_event(void)
memcpy(&local_paca->mce_info->mce_event_queue[index],
   &evt, sizeof(evt));
 
-   /* Queue irq work to process this event later. */
-   irq_work_queue(&mce_event_process_work);
+   /* Queue irq work to process this event later. Before
+* queuing the work enable translation for non radix LPAR,
+* as irq_work_queue may try to access memory outside RMO
+* region.
+*/
+   if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) {
+   msr = mfmsr();
+   mtmsr(msr | MSR_IR | MSR_DR);
+   irq_work_queue(&mce_event_process_work);
+   mtmsr(msr);
+   } else {
+   irq_work_queue(&mce_event_process_work);
+   }
 }
 
 void mce_common_process_ue(struct pt_regs *regs,
-- 
2.31.1



[PATCH v3 1/3] powerpc/pseries: Parse control memory access error

2021-09-06 Thread Ganesh Goudar
Add support to parse and log control memory access
error for pseries. These changes are made according to
PAPR v2.11 10.3.2.2.12.

Signed-off-by: Ganesh Goudar 
---
v3: Modify the commit log to mention the document according
to which changes are made.
Define and use a macro to check if the effective address
is provided.

v2: No changes.
---
 arch/powerpc/platforms/pseries/ras.c | 36 
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 56092dccfdb8..e62a0ca2611a 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -60,11 +60,17 @@ struct pseries_mc_errorlog {
 *  XX  2: Reserved.
 *XXX   3: Type of UE error.
 *
-* For error_type != MC_ERROR_TYPE_UE
+* For error_type == MC_ERROR_TYPE_SLB/ERAT/TLB
 *   
 *   X  1: Effective address provided.
 *X 5: Reserved.
 * XX   2: Type of SLB/ERAT/TLB error.
+*
+* For error_type == MC_ERROR_TYPE_CTRL_MEM_ACCESS
+*   
+*   X  1: Error causing address provided.
+*XXX   3: Type of error.
+*      4: Reserved.
 */
u8  sub_err_type;
u8  reserved_1[6];
@@ -80,6 +86,7 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_TYPE_TLB  0x04
 #define MC_ERROR_TYPE_D_CACHE  0x05
 #define MC_ERROR_TYPE_I_CACHE  0x07
+#define MC_ERROR_TYPE_CTRL_MEM_ACCESS  0x08
 
 /* RTAS pseries MCE error sub types */
 #define MC_ERROR_UE_INDETERMINATE  0
@@ -90,6 +97,7 @@ struct pseries_mc_errorlog {
 
 #define UE_EFFECTIVE_ADDR_PROVIDED 0x40
 #define UE_LOGICAL_ADDR_PROVIDED   0x20
+#define MC_EFFECTIVE_ADDR_PROVIDED 0x80
 
 #define MC_ERROR_SLB_PARITY0
 #define MC_ERROR_SLB_MULTIHIT  1
@@ -103,6 +111,9 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_TLB_MULTIHIT  2
 #define MC_ERROR_TLB_INDETERMINATE 3
 
+#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK   0
+#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1
+
 static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
 {
switch (mlog->error_type) {
@@ -112,6 +123,8 @@ static inline u8 rtas_mc_error_sub_type(const struct 
pseries_mc_errorlog *mlog)
case MC_ERROR_TYPE_ERAT:
case MC_ERROR_TYPE_TLB:
return (mlog->sub_err_type & 0x03);
+   case MC_ERROR_TYPE_CTRL_MEM_ACCESS:
+   return (mlog->sub_err_type & 0x70) >> 4;
default:
return 0;
}
@@ -656,7 +669,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
break;
}
-   if (mce_log->sub_err_type & 0x80)
+   if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
eaddr = be64_to_cpu(mce_log->effective_address);
break;
case MC_ERROR_TYPE_ERAT:
@@ -673,7 +686,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.u.erat_error_type = 
MCE_ERAT_ERROR_INDETERMINATE;
break;
}
-   if (mce_log->sub_err_type & 0x80)
+   if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
eaddr = be64_to_cpu(mce_log->effective_address);
break;
case MC_ERROR_TYPE_TLB:
@@ -690,7 +703,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE;
break;
}
-   if (mce_log->sub_err_type & 0x80)
+   if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
eaddr = be64_to_cpu(mce_log->effective_address);
break;
case MC_ERROR_TYPE_D_CACHE:
@@ -699,6 +712,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
case MC_ERROR_TYPE_I_CACHE:
mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
break;
+   case MC_ERROR_TYPE_CTRL_MEM_ACCESS:
+   mce_err.error_type = MCE_ERROR_TYPE_RA;
+   switch (err_sub_type) {
+   case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK:
+   mce_err.u.ra_error_type =
+   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN;
+   break;
+   case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS:
+   mce_err.u.ra_error_type =
+   MCE_RA_ERROR_LOAD_STORE_FOREIGN;
+   break;
+ 

[PATCH v3 2/3] selftests/powerpc: Add test for real address error handling

2021-09-06 Thread Ganesh Goudar
Add test for real address or control memory address access
error handling, using NX-GZIP engine.

The error is injected by accessing the control memory address
using illegal instruction, on successful handling the process
attempting to access control memory address using illegal
instruction receives SIGBUS.

Signed-off-by: Ganesh Goudar 
---
v3: Avoid using shell script to inject error.

v2: Fix build error.
---
 tools/testing/selftests/powerpc/Makefile  |  3 +-
 tools/testing/selftests/powerpc/mce/Makefile  |  7 ++
 .../selftests/powerpc/mce/inject-ra-err.c | 65 +++
 tools/testing/selftests/powerpc/mce/vas-api.h |  1 +
 4 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/mce/Makefile
 create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c
 create mode 12 tools/testing/selftests/powerpc/mce/vas-api.h

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 0830e63818c1..4830372d7416 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -31,7 +31,8 @@ SUB_DIRS = alignment  \
   vphn \
   math \
   ptrace   \
-  security
+  security \
+  mce
 
 endif
 
diff --git a/tools/testing/selftests/powerpc/mce/Makefile 
b/tools/testing/selftests/powerpc/mce/Makefile
new file mode 100644
index ..2424513982d9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/Makefile
@@ -0,0 +1,7 @@
+#SPDX-License-Identifier: GPL-2.0-or-later
+
+TEST_GEN_PROGS := inject-ra-err
+
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c
diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c 
b/tools/testing/selftests/powerpc/mce/inject-ra-err.c
new file mode 100644
index ..94323c34d9a6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vas-api.h"
+#include "utils.h"
+
+static bool faulted;
+
+static void sigbus_handler(int n, siginfo_t *info, void *ctxt_v)
+{
+   ucontext_t *ctxt = (ucontext_t *)ctxt_v;
+   struct pt_regs *regs = ctxt->uc_mcontext.regs;
+
+   faulted = true;
+   regs->nip += 4;
+}
+
+static int test_ra_error(void)
+{
+   struct vas_tx_win_open_attr attr;
+   int fd, *paste_addr;
+   char *devname = "/dev/crypto/nx-gzip";
+   struct sigaction act = {
+   .sa_sigaction = sigbus_handler,
+   .sa_flags = SA_SIGINFO,
+   };
+
+   memset(&attr, 0, sizeof(attr));
+   attr.version = 1;
+   attr.vas_id = 0;
+
+   SKIP_IF(access(devname, F_OK));
+
+   fd = open(devname, O_RDWR);
+   FAIL_IF(fd < 0);
+   FAIL_IF(ioctl(fd, VAS_TX_WIN_OPEN, &attr) < 0);
+   FAIL_IF(sigaction(SIGBUS, &act, NULL) != 0);
+
+   paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 
0ULL);
+
+   /* The following assignment triggers exception */
+   mb();
+   *paste_addr = 1;
+   mb();
+
+   FAIL_IF(!faulted);
+
+   return 0;
+}
+
+int main(void)
+{
+   return test_harness(test_ra_error, "inject-ra-err");
+}
+
diff --git a/tools/testing/selftests/powerpc/mce/vas-api.h 
b/tools/testing/selftests/powerpc/mce/vas-api.h
new file mode 12
index ..1455c1bcd351
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/vas-api.h
@@ -0,0 +1 @@
+../../../../../arch/powerpc/include/uapi/asm/vas-api.h
\ No newline at end of file
-- 
2.31.1



[PATCH v3 3/3] powerpc/mce: Modify the real address error logging messages

2021-09-06 Thread Ganesh Goudar
To avoid ambiguity, modify the strings in real address error
logging messages to "foreign/control memory" from "foreign",
since the error descriptions in the P9 user manual and P10 user
manual are different for the same type of errors.

P9 User Manual for MCE:
DSISR:59 Host real address to foreign space during translation.
DSISR:60 Host real address to foreign space on a load or store
 access.

P10 User Manual for MCE:
DSISR:59 D-side tablewalk used a host real address in the
 control memory address range.
DSISR:60 D-side operand access to control memory address space.

Signed-off-by: Ganesh Goudar 
---
v3: No changes.

v2: No changes.
---
 arch/powerpc/kernel/mce.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9d1e39d42e3e..5baf69503349 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -400,14 +400,14 @@ void machine_check_print_event_info(struct 
machine_check_event *evt,
static const char *mc_ra_types[] = {
"Indeterminate",
"Instruction fetch (bad)",
-   "Instruction fetch (foreign)",
+   "Instruction fetch (foreign/control memory)",
"Page table walk ifetch (bad)",
-   "Page table walk ifetch (foreign)",
+   "Page table walk ifetch (foreign/control memory)",
"Load (bad)",
"Store (bad)",
"Page table walk Load/Store (bad)",
-   "Page table walk Load/Store (foreign)",
-   "Load/Store (foreign)",
+   "Page table walk Load/Store (foreign/control memory)",
+   "Load/Store (foreign/control memory)",
};
static const char *mc_link_types[] = {
"Indeterminate",
-- 
2.31.1



[PATCH v2] powerpc/mce: Fix access error in mce handler

2021-09-08 Thread Ganesh Goudar
We queue an irq work for deferred processing of the mce event
in the realmode mce handler, where translation is disabled.
Queuing of the work may result in accessing memory outside the
RMO region; such access needs translation to be enabled
for an LPAR running with hash MMU, else the kernel crashes.

After enabling translation in mce_handle_error() we used to
leave it enabled to avoid crashing here, but now with the
commit 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before
returning from handler") we are restoring the MSR to disable
translation.

Hence to fix this enable the translation before queuing the work.

Without this change following trace is seen on injecting SLB
multihit in an LPAR running with hash mmu.

Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
CPU: 5 PID: 1883 Comm: insmod Tainted: GOE 5.14.0-mce+ #137
NIP:  c0735d60 LR: c0318640 CTR: 
REGS: c0001ebff9a0 TRAP: 0300   Tainted: G   OE  (5.14.0-mce+)
MSR:  80001003   CR: 28008228  XER: 0001
CFAR: c031863c DAR: c0027fa8fe08 DSISR: 4000 IRQMASK: 0
GPR00: c03186d0 c0001ebffc40 c1b0df00 c16337e8
GPR04: c16337e8 c0027fa8fe08 0023 c16337f0
GPR08: 0023 c12ffe08  c00801460240
GPR12:  c0001ec9a900 c0002ac4bd00 
GPR16: 05a0 c008006b c008006b05a0 c0ff3068
GPR20: c0002ac4bbc0 0001 c0002ac4bbc0 c00801490298
GPR24: c00801490108 c1636198 c00801470090 c00801470058
GPR28: 0510 c0080100 c0080819 0019
NIP [c0735d60] llist_add_batch+0x0/0x40
LR [c0318640] __irq_work_queue_local+0x70/0xc0
Call Trace:
[c0001ebffc40] [c0001ebffc0c] 0xc0001ebffc0c (unreliable)
[c0001ebffc60] [c03186d0] irq_work_queue+0x40/0x70
[c0001ebffc80] [c004425c] machine_check_queue_event+0xbc/0xd0
[c0001ebffcf0] [c000838c] machine_check_early_common+0x16c/0x1f4

Fixes: 74c3354bc1d89 ("powerpc/pseries/mce: restore msr before returning from 
handler")
Signed-off-by: Ganesh Goudar 
---
v2: Change in commit message.
---
 arch/powerpc/kernel/mce.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 47a683cd00d2..9d1e39d42e3e 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -249,6 +249,7 @@ void machine_check_queue_event(void)
 {
int index;
struct machine_check_event evt;
+   unsigned long msr;
 
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
@@ -262,8 +263,19 @@ void machine_check_queue_event(void)
memcpy(&local_paca->mce_info->mce_event_queue[index],
   &evt, sizeof(evt));
 
-   /* Queue irq work to process this event later. */
-   irq_work_queue(&mce_event_process_work);
+   /* Queue irq work to process this event later. Before
+* queuing the work enable translation for non radix LPAR,
+* as irq_work_queue may try to access memory outside RMO
+* region.
+*/
+   if (!radix_enabled() && firmware_has_feature(FW_FEATURE_LPAR)) {
+   msr = mfmsr();
+   mtmsr(msr | MSR_IR | MSR_DR);
+   irq_work_queue(&mce_event_process_work);
+   mtmsr(msr);
+   } else {
+   irq_work_queue(&mce_event_process_work);
+   }
 }
 
 void mce_common_process_ue(struct pt_regs *regs,
-- 
2.31.1



[PATCH 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode

2021-11-08 Thread Ganesh Goudar
In realmode mce handler we use irq_work_queue() to defer
the processing of mce events, irq_work_queue() can only
be called when translation is enabled because it touches
memory outside RMA, hence we enable translation before
calling irq_work_queue and disable on return, though it
is not safe to do in realmode.

To avoid this, program the decrementer and call the event
processing functions from timer handler.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/machdep.h   |  2 +
 arch/powerpc/include/asm/mce.h   |  2 +
 arch/powerpc/include/asm/paca.h  |  1 +
 arch/powerpc/kernel/mce.c| 51 +++-
 arch/powerpc/kernel/time.c   |  3 ++
 arch/powerpc/platforms/pseries/pseries.h |  1 +
 arch/powerpc/platforms/pseries/ras.c | 31 +-
 arch/powerpc/platforms/pseries/setup.c   |  1 +
 8 files changed, 34 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index 764f2732a821..c89cc03c0f97 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -103,6 +103,8 @@ struct machdep_calls {
/* Called during machine check exception to retrive fixup address. */
bool(*mce_check_early_recovery)(struct pt_regs *regs);
 
+   void(*machine_check_log_err)(void);
+
/* Motherboard/chipset features. This is a kind of general purpose
 * hook used to control some machine specific features (like reset
 * lines, chip power control, etc...).
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 331d944280b8..187810f13669 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct 
machine_check_event *evt,
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
 extern void mce_common_process_ue(struct pt_regs *regs,
  struct mce_error_info *mce_err);
+extern void machine_check_raise_dec_intr(void);
 int mce_register_notifier(struct notifier_block *nb);
 int mce_unregister_notifier(struct notifier_block *nb);
+void mce_run_late_handlers(void);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 void flush_erat(void);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index dc05a862e72a..f49180f8c9be 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -280,6 +280,7 @@ struct paca_struct {
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
struct mce_info *mce_info;
+   atomic_t mces_to_process;
 #endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd829f7f25a4..45baa062ebc0 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -28,19 +28,9 @@
 
 #include "setup.h"
 
-static void machine_check_process_queued_event(struct irq_work *work);
-static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
-static struct irq_work mce_event_process_work = {
-.func = machine_check_process_queued_event,
-};
-
-static struct irq_work mce_ue_event_irq_work = {
-   .func = machine_check_ue_irq_work,
-};
-
 static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
 static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
@@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event 
*mce,
}
 }
 
+/* Raise decrementer interrupt */
+void machine_check_raise_dec_intr(void)
+{
+   set_dec(1);
+}
+
 /*
  * Decode and save high level MCE information into per cpu buffer which
  * is an array of machine_check_event structure.
@@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (mce->error_type == MCE_ERROR_TYPE_UE)
mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
+   atomic_inc(&local_paca->mces_to_process);
+
if (!addr)
return;
 
@@ -217,7 +215,7 @@ void release_mce_event(void)
get_mce_event(NULL, true);
 }
 
-static void machine_check_ue_irq_work(struct irq_work *work)
+static void machine_check_ue_work(void)
 {
schedule_work(&mce_ue_event_work);
 }
@@ -239,7 +237,7 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
   evt, sizeof(*evt));
 
/* Queue work to process this event later. */
-   irq_work_queue(&mce_ue_event_irq_work);
+   machine_check_raise_dec_intr();
 }
 
 /*
@@ -249,7 +247,6 @@ void machine_check_queue_event(void)
 {
int index;
struct machine_check_event evt;
-   unsigned long msr;
 
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
@@ -263,20 +2

[PATCH 2/2] pseries/mce: Refactor the pseries mce handling code

2021-11-08 Thread Ganesh Goudar
Now that we are no longer switching on the MMU in the realmode
mce handler, partially revert commit 4ff753feab02 ("powerpc/pseries:
Avoid using addr_to_pfn in real mode"), which
introduced the functions mce_handle_err_virtmode/realmode() to
separate out mce handler code that needed translation to be enabled.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/platforms/pseries/ras.c | 122 +++
 1 file changed, 49 insertions(+), 73 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 8613f9cc5798..62e1519b8355 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
 }
 
-static int mce_handle_err_realmode(int disposition, u8 error_type)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-   if (disposition == RTAS_DISP_NOT_RECOVERED) {
-   switch (error_type) {
-   case MC_ERROR_TYPE_ERAT:
-   flush_erat();
-   disposition = RTAS_DISP_FULLY_RECOVERED;
-   break;
-   case MC_ERROR_TYPE_SLB:
-   /*
-* Store the old slb content in paca before flushing.
-* Print this when we go to virtual mode.
-* There are chances that we may hit MCE again if there
-* is a parity error on the SLB entry we trying to read
-* for saving. Hence limit the slb saving to single
-* level of recursion.
-*/
-   if (local_paca->in_mce == 1)
-   slb_save_contents(local_paca->mce_faulty_slbs);
-   flush_and_reload_slb();
-   disposition = RTAS_DISP_FULLY_RECOVERED;
-   break;
-   default:
-   break;
-   }
-   } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
-   /* Platform corrected itself but could be degraded */
-   pr_err("MCE: limited recovery, system may be degraded\n");
-   disposition = RTAS_DISP_FULLY_RECOVERED;
-   }
-#endif
-   return disposition;
-}
-
-static int mce_handle_err_virtmode(struct pt_regs *regs,
-  struct rtas_error_log *errp,
-  struct pseries_mc_errorlog *mce_log,
-  int disposition)
+static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 {
struct mce_error_info mce_err = { 0 };
+   unsigned long eaddr = 0, paddr = 0;
+   struct pseries_errorlog *pseries_log;
+   struct pseries_mc_errorlog *mce_log;
+   int disposition = rtas_error_disposition(errp);
int initiator = rtas_error_initiator(errp);
int severity = rtas_error_severity(errp);
-   unsigned long eaddr = 0, paddr = 0;
u8 error_type, err_sub_type;
 
-   if (!mce_log)
-   goto out;
-
-   error_type = mce_log->error_type;
-   err_sub_type = rtas_mc_error_sub_type(mce_log);
-
if (initiator == RTAS_INITIATOR_UNKNOWN)
mce_err.initiator = MCE_INITIATOR_UNKNOWN;
else if (initiator == RTAS_INITIATOR_CPU)
@@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.severity = MCE_SEV_SEVERE;
else if (severity == RTAS_SEVERITY_ERROR)
mce_err.severity = MCE_SEV_SEVERE;
+   else if (severity == RTAS_SEVERITY_FATAL)
+   mce_err.severity = MCE_SEV_FATAL;
else
mce_err.severity = MCE_SEV_FATAL;
 
@@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
mce_err.error_class = MCE_ECLASS_UNKNOWN;
 
-   switch (error_type) {
+   if (!rtas_error_extended(errp))
+   goto out;
+
+   pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+   if (!pseries_log)
+   goto out;
+
+   mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+   error_type = mce_log->error_type;
+   err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+   switch (mce_log->error_type) {
case MC_ERROR_TYPE_UE:
mce_err.error_type = MCE_ERROR_TYPE_UE;
mce_common_process_ue(regs, &mce_err);
@@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
break;
case MC_ERROR_TYPE_I_CACHE:
-   mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
+   mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
break;
c

[PATCH v2 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode

2021-11-23 Thread Ganesh Goudar
In the realmode MCE handler we use irq_work_queue() to defer
the processing of MCE events. irq_work_queue() can only
be called when translation is enabled, because it touches
memory outside the RMA; hence we enable translation before
calling irq_work_queue() and disable it on return, even
though doing so is not safe in realmode.

To avoid this, program the decrementer and call the event
processing functions from timer handler.

Signed-off-by: Ganesh Goudar 
---
V2:
* Use arch_irq_work_raise to raise decrementer interrupt.
* Avoid having atomic variable.
---
 arch/powerpc/include/asm/machdep.h   |  2 +
 arch/powerpc/include/asm/mce.h   |  2 +
 arch/powerpc/include/asm/paca.h  |  1 +
 arch/powerpc/kernel/mce.c| 51 +++-
 arch/powerpc/kernel/time.c   |  2 +
 arch/powerpc/platforms/pseries/pseries.h |  1 +
 arch/powerpc/platforms/pseries/ras.c | 31 +-
 arch/powerpc/platforms/pseries/setup.c   |  1 +
 8 files changed, 33 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index 9c3c9f04129f..d22b222ba471 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -99,6 +99,8 @@ struct machdep_calls {
/* Called during machine check exception to retrive fixup address. */
bool(*mce_check_early_recovery)(struct pt_regs *regs);
 
+   void(*machine_check_log_err)(void);
+
/* Motherboard/chipset features. This is a kind of general purpose
 * hook used to control some machine specific features (like reset
 * lines, chip power control, etc...).
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 331d944280b8..6e306aaf58aa 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct 
machine_check_event *evt,
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
 extern void mce_common_process_ue(struct pt_regs *regs,
  struct mce_error_info *mce_err);
+void machine_check_raise_dec_intr(void);
 int mce_register_notifier(struct notifier_block *nb);
 int mce_unregister_notifier(struct notifier_block *nb);
+void mce_run_late_handlers(void);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 void flush_erat(void);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index dc05a862e72a..d463c796f7fa 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -280,6 +280,7 @@ struct paca_struct {
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
struct mce_info *mce_info;
+   u32 mces_to_process;
 #endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd829f7f25a4..8e17f29472a0 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -28,19 +28,9 @@
 
 #include "setup.h"
 
-static void machine_check_process_queued_event(struct irq_work *work);
-static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
-static struct irq_work mce_event_process_work = {
-.func = machine_check_process_queued_event,
-};
-
-static struct irq_work mce_ue_event_irq_work = {
-   .func = machine_check_ue_irq_work,
-};
-
 static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
 static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
@@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event 
*mce,
}
 }
 
+/* Raise decrementer interrupt */
+void machine_check_raise_dec_intr(void)
+{
+   arch_irq_work_raise();
+}
+
 /*
  * Decode and save high level MCE information into per cpu buffer which
  * is an array of machine_check_event structure.
@@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (mce->error_type == MCE_ERROR_TYPE_UE)
mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
+   local_paca->mces_to_process++;
+
if (!addr)
return;
 
@@ -217,7 +215,7 @@ void release_mce_event(void)
get_mce_event(NULL, true);
 }
 
-static void machine_check_ue_irq_work(struct irq_work *work)
+static void machine_check_ue_work(void)
 {
schedule_work(&mce_ue_event_work);
 }
@@ -239,7 +237,7 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
   evt, sizeof(*evt));
 
/* Queue work to process this event later. */
-   irq_work_queue(&mce_ue_event_irq_work);
+   machine_check_raise_dec_intr();
 }
 
 /*
@@ -249,7 +247,6 @@ void machine_check_queue_event(void)
 {
int index;
struct machine_check_event evt;
-   unsigned long msr;
 
 

[PATCH v2 2/2] pseries/mce: Refactor the pseries mce handling code

2021-11-23 Thread Ganesh Goudar
Now that we are no longer switching on the MMU in the realmode
MCE handler, partially revert commit 4ff753feab02 ("powerpc/pseries:
Avoid using addr_to_pfn in real mode"), which introduced the
functions mce_handle_err_virtmode/realmode() to separate out the
MCE handler code that needed translation to be enabled.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/platforms/pseries/ras.c | 122 +++
 1 file changed, 49 insertions(+), 73 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 8613f9cc5798..62e1519b8355 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
 }
 
-static int mce_handle_err_realmode(int disposition, u8 error_type)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-   if (disposition == RTAS_DISP_NOT_RECOVERED) {
-   switch (error_type) {
-   caseMC_ERROR_TYPE_ERAT:
-   flush_erat();
-   disposition = RTAS_DISP_FULLY_RECOVERED;
-   break;
-   caseMC_ERROR_TYPE_SLB:
-   /*
-* Store the old slb content in paca before flushing.
-* Print this when we go to virtual mode.
-* There are chances that we may hit MCE again if there
-* is a parity error on the SLB entry we trying to read
-* for saving. Hence limit the slb saving to single
-* level of recursion.
-*/
-   if (local_paca->in_mce == 1)
-   slb_save_contents(local_paca->mce_faulty_slbs);
-   flush_and_reload_slb();
-   disposition = RTAS_DISP_FULLY_RECOVERED;
-   break;
-   default:
-   break;
-   }
-   } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
-   /* Platform corrected itself but could be degraded */
-   pr_err("MCE: limited recovery, system may be degraded\n");
-   disposition = RTAS_DISP_FULLY_RECOVERED;
-   }
-#endif
-   return disposition;
-}
-
-static int mce_handle_err_virtmode(struct pt_regs *regs,
-  struct rtas_error_log *errp,
-  struct pseries_mc_errorlog *mce_log,
-  int disposition)
+static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 {
struct mce_error_info mce_err = { 0 };
+   unsigned long eaddr = 0, paddr = 0;
+   struct pseries_errorlog *pseries_log;
+   struct pseries_mc_errorlog *mce_log;
+   int disposition = rtas_error_disposition(errp);
int initiator = rtas_error_initiator(errp);
int severity = rtas_error_severity(errp);
-   unsigned long eaddr = 0, paddr = 0;
u8 error_type, err_sub_type;
 
-   if (!mce_log)
-   goto out;
-
-   error_type = mce_log->error_type;
-   err_sub_type = rtas_mc_error_sub_type(mce_log);
-
if (initiator == RTAS_INITIATOR_UNKNOWN)
mce_err.initiator = MCE_INITIATOR_UNKNOWN;
else if (initiator == RTAS_INITIATOR_CPU)
@@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.severity = MCE_SEV_SEVERE;
else if (severity == RTAS_SEVERITY_ERROR)
mce_err.severity = MCE_SEV_SEVERE;
+   else if (severity == RTAS_SEVERITY_FATAL)
+   mce_err.severity = MCE_SEV_FATAL;
else
mce_err.severity = MCE_SEV_FATAL;
 
@@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
mce_err.error_class = MCE_ECLASS_UNKNOWN;
 
-   switch (error_type) {
+   if (!rtas_error_extended(errp))
+   goto out;
+
+   pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+   if (!pseries_log)
+   goto out;
+
+   mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+   error_type = mce_log->error_type;
+   err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+   switch (mce_log->error_type) {
case MC_ERROR_TYPE_UE:
mce_err.error_type = MCE_ERROR_TYPE_UE;
mce_common_process_ue(regs, &mce_err);
@@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
break;
case MC_ERROR_TYPE_I_CACHE:
-   mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
+   mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
break;
c

[PATCH v3 2/2] pseries/mce: Refactor the pseries mce handling code

2021-11-24 Thread Ganesh Goudar
Now that we are no longer switching on the MMU in the realmode
MCE handler, partially revert commit 4ff753feab02 ("powerpc/pseries:
Avoid using addr_to_pfn in real mode"), which introduced the
functions mce_handle_err_virtmode/realmode() to separate out the
MCE handler code that needed translation to be enabled.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/platforms/pseries/ras.c | 122 +++
 1 file changed, 49 insertions(+), 73 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 8613f9cc5798..62e1519b8355 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
 }
 
-static int mce_handle_err_realmode(int disposition, u8 error_type)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-   if (disposition == RTAS_DISP_NOT_RECOVERED) {
-   switch (error_type) {
-   caseMC_ERROR_TYPE_ERAT:
-   flush_erat();
-   disposition = RTAS_DISP_FULLY_RECOVERED;
-   break;
-   caseMC_ERROR_TYPE_SLB:
-   /*
-* Store the old slb content in paca before flushing.
-* Print this when we go to virtual mode.
-* There are chances that we may hit MCE again if there
-* is a parity error on the SLB entry we trying to read
-* for saving. Hence limit the slb saving to single
-* level of recursion.
-*/
-   if (local_paca->in_mce == 1)
-   slb_save_contents(local_paca->mce_faulty_slbs);
-   flush_and_reload_slb();
-   disposition = RTAS_DISP_FULLY_RECOVERED;
-   break;
-   default:
-   break;
-   }
-   } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
-   /* Platform corrected itself but could be degraded */
-   pr_err("MCE: limited recovery, system may be degraded\n");
-   disposition = RTAS_DISP_FULLY_RECOVERED;
-   }
-#endif
-   return disposition;
-}
-
-static int mce_handle_err_virtmode(struct pt_regs *regs,
-  struct rtas_error_log *errp,
-  struct pseries_mc_errorlog *mce_log,
-  int disposition)
+static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 {
struct mce_error_info mce_err = { 0 };
+   unsigned long eaddr = 0, paddr = 0;
+   struct pseries_errorlog *pseries_log;
+   struct pseries_mc_errorlog *mce_log;
+   int disposition = rtas_error_disposition(errp);
int initiator = rtas_error_initiator(errp);
int severity = rtas_error_severity(errp);
-   unsigned long eaddr = 0, paddr = 0;
u8 error_type, err_sub_type;
 
-   if (!mce_log)
-   goto out;
-
-   error_type = mce_log->error_type;
-   err_sub_type = rtas_mc_error_sub_type(mce_log);
-
if (initiator == RTAS_INITIATOR_UNKNOWN)
mce_err.initiator = MCE_INITIATOR_UNKNOWN;
else if (initiator == RTAS_INITIATOR_CPU)
@@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.severity = MCE_SEV_SEVERE;
else if (severity == RTAS_SEVERITY_ERROR)
mce_err.severity = MCE_SEV_SEVERE;
+   else if (severity == RTAS_SEVERITY_FATAL)
+   mce_err.severity = MCE_SEV_FATAL;
else
mce_err.severity = MCE_SEV_FATAL;
 
@@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
mce_err.error_class = MCE_ECLASS_UNKNOWN;
 
-   switch (error_type) {
+   if (!rtas_error_extended(errp))
+   goto out;
+
+   pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+   if (!pseries_log)
+   goto out;
+
+   mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+   error_type = mce_log->error_type;
+   err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+   switch (mce_log->error_type) {
case MC_ERROR_TYPE_UE:
mce_err.error_type = MCE_ERROR_TYPE_UE;
mce_common_process_ue(regs, &mce_err);
@@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
break;
case MC_ERROR_TYPE_I_CACHE:
-   mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
+   mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
break;
c

[PATCH v3 1/2] powerpc/mce: Avoid using irq_work_queue() in realmode

2021-11-24 Thread Ganesh Goudar
In the realmode MCE handler we use irq_work_queue() to defer
the processing of MCE events. irq_work_queue() can only
be called when translation is enabled, because it touches
memory outside the RMA; hence we enable translation before
calling irq_work_queue() and disable it on return, even
though doing so is not safe in realmode.

To avoid this, program the decrementer and call the event
processing functions from timer handler.

Signed-off-by: Ganesh Goudar 
---
V2:
* Use arch_irq_work_raise to raise decrementer interrupt.
* Avoid having atomic variable.

V3:
* Fix build error.
  Reported by kernel test bot.
---
 arch/powerpc/include/asm/machdep.h   |  2 +
 arch/powerpc/include/asm/mce.h   |  2 +
 arch/powerpc/include/asm/paca.h  |  1 +
 arch/powerpc/kernel/mce.c| 51 +++-
 arch/powerpc/kernel/time.c   |  3 ++
 arch/powerpc/platforms/pseries/pseries.h |  1 +
 arch/powerpc/platforms/pseries/ras.c | 31 +-
 arch/powerpc/platforms/pseries/setup.c   |  1 +
 8 files changed, 34 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index 9c3c9f04129f..d22b222ba471 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -99,6 +99,8 @@ struct machdep_calls {
/* Called during machine check exception to retrive fixup address. */
bool(*mce_check_early_recovery)(struct pt_regs *regs);
 
+   void(*machine_check_log_err)(void);
+
/* Motherboard/chipset features. This is a kind of general purpose
 * hook used to control some machine specific features (like reset
 * lines, chip power control, etc...).
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 331d944280b8..6e306aaf58aa 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -235,8 +235,10 @@ extern void machine_check_print_event_info(struct 
machine_check_event *evt,
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
 extern void mce_common_process_ue(struct pt_regs *regs,
  struct mce_error_info *mce_err);
+void machine_check_raise_dec_intr(void);
 int mce_register_notifier(struct notifier_block *nb);
 int mce_unregister_notifier(struct notifier_block *nb);
+void mce_run_late_handlers(void);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 void flush_erat(void);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index dc05a862e72a..d463c796f7fa 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -280,6 +280,7 @@ struct paca_struct {
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
struct mce_info *mce_info;
+   u32 mces_to_process;
 #endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd829f7f25a4..8e17f29472a0 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -28,19 +28,9 @@
 
 #include "setup.h"
 
-static void machine_check_process_queued_event(struct irq_work *work);
-static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
-static struct irq_work mce_event_process_work = {
-.func = machine_check_process_queued_event,
-};
-
-static struct irq_work mce_ue_event_irq_work = {
-   .func = machine_check_ue_irq_work,
-};
-
 static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
 static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
@@ -89,6 +79,12 @@ static void mce_set_error_info(struct machine_check_event 
*mce,
}
 }
 
+/* Raise decrementer interrupt */
+void machine_check_raise_dec_intr(void)
+{
+   arch_irq_work_raise();
+}
+
 /*
  * Decode and save high level MCE information into per cpu buffer which
  * is an array of machine_check_event structure.
@@ -135,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (mce->error_type == MCE_ERROR_TYPE_UE)
mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
+   local_paca->mces_to_process++;
+
if (!addr)
return;
 
@@ -217,7 +215,7 @@ void release_mce_event(void)
get_mce_event(NULL, true);
 }
 
-static void machine_check_ue_irq_work(struct irq_work *work)
+static void machine_check_ue_work(void)
 {
schedule_work(&mce_ue_event_work);
 }
@@ -239,7 +237,7 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
   evt, sizeof(*evt));
 
/* Queue work to process this event later. */
-   irq_work_queue(&mce_ue_event_irq_work);
+   machine_check_raise_dec_intr();
 }
 
 /*
@@ -249,7 +247,6 @@ void machine_check_queue_event(void)
 {
int index;
struct machine_check_ev

[PATCH v5] lkdtm/powerpc: Add SLB multihit test

2020-11-30 Thread Ganesh Goudar
To check machine check handling, add support to inject slb
multihit errors.

Cc: Kees Cook 
Cc: Michal Suchánek 
Co-developed-by: Mahesh Salgaonkar 
Signed-off-by: Mahesh Salgaonkar 
Signed-off-by: Ganesh Goudar 
---
v5:
- Insert entries at SLB_NUM_BOLTED and SLB_NUM_BOLTED +1, remove index
  allocating helper function.
- Move mk_esid_data and mk_vsid_data helpers to asm/book3s/64/mmu-hash.h.
- Use mmu_linear_psize and mmu_vmalloc_psize to get page size.
- Use !radix_enabled() to check if we are in HASH mode.
- And other minor improvements.

v1-v4:
- No major changes here for this patch, This patch was initially posted
  along with the other patch which got accepted.
https://git.kernel.org/powerpc/c/8d0e2101274358d9b6b1f27232b40253ca48bab5
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  28 +++-
 arch/powerpc/mm/book3s64/hash_utils.c |   1 +
 arch/powerpc/mm/book3s64/slb.c|  27 
 drivers/misc/lkdtm/Makefile   |   1 +
 drivers/misc/lkdtm/core.c |   3 +
 drivers/misc/lkdtm/lkdtm.h|   3 +
 drivers/misc/lkdtm/powerpc.c  | 120 ++
 tools/testing/selftests/lkdtm/tests.txt   |   1 +
 8 files changed, 156 insertions(+), 28 deletions(-)
 create mode 100644 drivers/misc/lkdtm/powerpc.c

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 683a9c7d1b03..8b9f07900395 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -842,6 +842,32 @@ static inline unsigned long get_kernel_vsid(unsigned long 
ea, int ssize)
 
 unsigned htab_shift_for_mem_size(unsigned long mem_size);
 
-#endif /* __ASSEMBLY__ */
+enum slb_index {
+   LINEAR_INDEX= 0, /* Kernel linear map  (0xc000) */
+   KSTACK_INDEX= 1, /* Kernel stack map */
+};
 
+#define slb_esid_mask(ssize)   \
+   (((ssize) == MMU_SEGSIZE_256M) ? ESID_MASK : ESID_MASK_1T)
+
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+enum slb_index index)
+{
+   return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
+}
+
+static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
+  unsigned long flags)
+{
+   return (vsid << slb_vsid_shift(ssize)) | flags |
+   ((unsigned long)ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+unsigned long flags)
+{
+   return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
+}
+
+#endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 24702c0a92e0..38076a998850 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -112,6 +112,7 @@ int mmu_linear_psize = MMU_PAGE_4K;
 EXPORT_SYMBOL_GPL(mmu_linear_psize);
 int mmu_virtual_psize = MMU_PAGE_4K;
 int mmu_vmalloc_psize = MMU_PAGE_4K;
+EXPORT_SYMBOL_GPL(mmu_vmalloc_psize);
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 int mmu_vmemmap_psize = MMU_PAGE_4K;
 #endif
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index c30fcbfa0e32..985706acb0e5 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -28,35 +28,8 @@
 #include "internal.h"
 
 
-enum slb_index {
-   LINEAR_INDEX= 0, /* Kernel linear map  (0xc000) */
-   KSTACK_INDEX= 1, /* Kernel stack map */
-};
-
 static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
 
-#define slb_esid_mask(ssize)   \
-   (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
-
-static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
-enum slb_index index)
-{
-   return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
-}
-
-static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
-unsigned long flags)
-{
-   return (vsid << slb_vsid_shift(ssize)) | flags |
-   ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
-}
-
-static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
-unsigned long flags)
-{
-   return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
-}
-
 bool stress_slb_enabled __initdata;
 
 static int __init parse_stress_slb(char *p)
diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile
index c70b3822013f..f37ecfb0a707 100644
--- a/drivers/misc/lkdtm/Makefile
+++ b/drivers/misc/lkdtm/Makefile
@@ -10,6 +10,7 @@ lkdtm-$(CONFIG_LKDTM) += rodata_objcopy.o
 lkdtm-$(CONFIG_LKDTM)  += usercopy.o
 lkdtm-$(CONFIG_LKDTM)

[PATCH] powerpc/mce: Remove per cpu variables from MCE handlers

2020-12-04 Thread Ganesh Goudar
Access to per-cpu variables requires translation to be enabled on
pseries machines running in hash MMU mode. Since part of the MCE
handler runs in realmode, and part of the MCE handling code is
shared between the ppc platforms pseries and powernv, it becomes
difficult to manage these variables differently on different
platforms; so keep these variables in the paca instead of having
them as per-cpu variables, to avoid complications.

The maximum recursion depth of an MCE is 4. Considering the maximum
depth allowed, reduce the size of the event array from 100 to 10.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/mce.h  |  2 +-
 arch/powerpc/include/asm/paca.h | 12 
 arch/powerpc/kernel/mce.c   | 54 +
 3 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 89aa8248a57d..feef45f2b51b 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -204,7 +204,7 @@ struct mce_error_info {
boolignore_event;
 };
 
-#define MAX_MC_EVT 100
+#define MAX_MC_EVT 10
 
 /* Release flags for get_mce_event() */
 #define MCE_EVENT_RELEASE  true
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 9454d29ff4b4..4769954efa7d 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -273,6 +274,17 @@ struct paca_struct {
 #ifdef CONFIG_MMIOWB
struct mmiowb_state mmiowb_state;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+   int mce_nest_count;
+   struct machine_check_event mce_event[MAX_MC_EVT];
+   /* Queue for delayed MCE events. */
+   int mce_queue_count;
+   struct machine_check_event mce_event_queue[MAX_MC_EVT];
+
+   /* Queue for delayed MCE UE events. */
+   int mce_ue_count;
+   struct machine_check_event  mce_ue_event_queue[MAX_MC_EVT];
+#endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 63702c0badb9..5f53d02d6cbb 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -22,18 +22,6 @@
 #include 
 #include 
 
-static DEFINE_PER_CPU(int, mce_nest_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
-
-/* Queue for delayed MCE events. */
-static DEFINE_PER_CPU(int, mce_queue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
-
-/* Queue for delayed MCE UE events. */
-static DEFINE_PER_CPU(int, mce_ue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
-   mce_ue_event_queue);
-
 static void machine_check_process_queued_event(struct irq_work *work);
 static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
@@ -103,8 +91,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
struct mce_error_info *mce_err,
uint64_t nip, uint64_t addr, uint64_t phys_addr)
 {
-   int index = __this_cpu_inc_return(mce_nest_count) - 1;
-   struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
+   int index = get_paca()->mce_nest_count++;
+   struct machine_check_event *mce = &get_paca()->mce_event[index];
 
/*
 * Return if we don't have enough space to log mce event.
@@ -191,7 +179,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
  */
 int get_mce_event(struct machine_check_event *mce, bool release)
 {
-   int index = __this_cpu_read(mce_nest_count) - 1;
+   int index = get_paca()->mce_nest_count - 1;
struct machine_check_event *mc_evt;
int ret = 0;
 
@@ -201,7 +189,7 @@ int get_mce_event(struct machine_check_event *mce, bool 
release)
 
/* Check if we have MCE info to process. */
if (index < MAX_MC_EVT) {
-   mc_evt = this_cpu_ptr(&mce_event[index]);
+   mc_evt = &get_paca()->mce_event[index];
/* Copy the event structure and release the original */
if (mce)
*mce = *mc_evt;
@@ -211,7 +199,7 @@ int get_mce_event(struct machine_check_event *mce, bool 
release)
}
/* Decrement the count to free the slot. */
if (release)
-   __this_cpu_dec(mce_nest_count);
+   get_paca()->mce_nest_count--;
 
return ret;
 }
@@ -233,13 +221,13 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
 {
int index;
 
-   index = __this_cpu_inc_return(mce_ue_count) - 1;
+   index = get_paca()->mce_ue_count++;
/* If queue is full, just return for now. */
if (index >= MAX_MC_EVT) {
-   __this_cpu_dec(mce_ue_cou

[PATCH v2] powerpc/mce: Remove per cpu variables from MCE handlers

2021-01-07 Thread Ganesh Goudar
Access to per-cpu variables requires translation to be enabled on
pseries machines running in hash MMU mode. Since part of the MCE
handler runs in realmode, and part of the MCE handling code is
shared between the ppc platforms pseries and powernv, it becomes
difficult to manage these variables differently on different
platforms; so keep these variables in the paca instead of having
them as per-cpu variables, to avoid complications.

The maximum recursion depth of an MCE is 4. Considering the maximum
depth allowed, reduce the size of the event array from 100 to 10.

Signed-off-by: Ganesh Goudar 
---
v2: Dynamically allocate memory for machine check event info
---
 arch/powerpc/include/asm/mce.h | 21 +++-
 arch/powerpc/include/asm/paca.h|  4 ++
 arch/powerpc/kernel/mce.c  | 86 ++
 arch/powerpc/kernel/setup-common.c |  2 +-
 4 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index e6c27ae843dc..8d6e3a7a9f37 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -204,7 +204,18 @@ struct mce_error_info {
boolignore_event;
 };
 
-#define MAX_MC_EVT 100
+#define MAX_MC_EVT 10
+
+struct mce_info {
+   int mce_nest_count;
+   struct machine_check_event mce_event[MAX_MC_EVT];
+   /* Queue for delayed MCE events. */
+   int mce_queue_count;
+   struct machine_check_event mce_event_queue[MAX_MC_EVT];
+   /* Queue for delayed MCE UE events. */
+   int mce_ue_count;
+   struct machine_check_event  mce_ue_event_queue[MAX_MC_EVT];
+};
 
 /* Release flags for get_mce_event() */
 #define MCE_EVENT_RELEASE  true
@@ -233,5 +244,13 @@ long __machine_check_early_realmode_p7(struct pt_regs 
*regs);
 long __machine_check_early_realmode_p8(struct pt_regs *regs);
 long __machine_check_early_realmode_p9(struct pt_regs *regs);
 long __machine_check_early_realmode_p10(struct pt_regs *regs);
+#define get_mce_info() local_paca->mce_info
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void mce_init(void);
+#else
+static inline void mce_init(void) { };
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 9454d29ff4b4..38e0c55e845d 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -273,6 +274,9 @@ struct paca_struct {
 #ifdef CONFIG_MMIOWB
struct mmiowb_state mmiowb_state;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+   struct mce_info *mce_info;
+#endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9f3e133b57b7..14142ddbedf2 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -17,23 +17,12 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 
-static DEFINE_PER_CPU(int, mce_nest_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
-
-/* Queue for delayed MCE events. */
-static DEFINE_PER_CPU(int, mce_queue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
-
-/* Queue for delayed MCE UE events. */
-static DEFINE_PER_CPU(int, mce_ue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
-   mce_ue_event_queue);
-
 static void machine_check_process_queued_event(struct irq_work *work);
 static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
@@ -103,8 +92,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
struct mce_error_info *mce_err,
uint64_t nip, uint64_t addr, uint64_t phys_addr)
 {
-   int index = __this_cpu_inc_return(mce_nest_count) - 1;
-   struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
+   int index = get_mce_info()->mce_nest_count++;
+   struct machine_check_event *mce = &get_mce_info()->mce_event[index];
 
/*
 * Return if we don't have enough space to log mce event.
@@ -191,7 +180,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
  */
 int get_mce_event(struct machine_check_event *mce, bool release)
 {
-   int index = __this_cpu_read(mce_nest_count) - 1;
+   int index = get_mce_info()->mce_nest_count - 1;
struct machine_check_event *mc_evt;
int ret = 0;
 
@@ -201,7 +190,7 @@ int get_mce_event(struct machine_check_event *mce, bool 
release)
 
/* Check if we have MCE info to process. */
if (index < MAX_MC_EVT) {
-   mc_evt = this_cpu_ptr(&mce_event[index]);
+   mc_evt = &get_mce_info()->mce_event[index];

[PATCH v3] powerpc/mce: Remove per cpu variables from MCE handlers

2021-01-15 Thread Ganesh Goudar
Access to per-cpu variables requires translation to be enabled on
pseries machines running in hash MMU mode. Since part of the MCE
handler runs in real mode and part of the MCE handling code is shared
between the pseries and powernv ppc architectures, it becomes
difficult to manage these variables differently on different
architectures, so keep these variables in the paca instead of having
them as per-cpu variables, to avoid complications.

The maximum recursive depth of MCE is 4. Considering the maximum
depth allowed, reduce the size of the event arrays to 10 from 100.

Signed-off-by: Ganesh Goudar 
---
v2: Dynamically allocate memory for machine check event info

v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid
to allocate memory.
---
 arch/powerpc/include/asm/mce.h | 21 -
 arch/powerpc/include/asm/paca.h|  4 ++
 arch/powerpc/kernel/mce.c  | 76 +-
 arch/powerpc/kernel/setup-common.c |  2 +-
 4 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index e6c27ae843dc..8d6e3a7a9f37 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -204,7 +204,18 @@ struct mce_error_info {
boolignore_event;
 };
 
-#define MAX_MC_EVT 100
+#define MAX_MC_EVT 10
+
+struct mce_info {
+   int mce_nest_count;
+   struct machine_check_event mce_event[MAX_MC_EVT];
+   /* Queue for delayed MCE events. */
+   int mce_queue_count;
+   struct machine_check_event mce_event_queue[MAX_MC_EVT];
+   /* Queue for delayed MCE UE events. */
+   int mce_ue_count;
+   struct machine_check_event  mce_ue_event_queue[MAX_MC_EVT];
+};
 
 /* Release flags for get_mce_event() */
 #define MCE_EVENT_RELEASE  true
@@ -233,5 +244,13 @@ long __machine_check_early_realmode_p7(struct pt_regs 
*regs);
 long __machine_check_early_realmode_p8(struct pt_regs *regs);
 long __machine_check_early_realmode_p9(struct pt_regs *regs);
 long __machine_check_early_realmode_p10(struct pt_regs *regs);
+#define get_mce_info() local_paca->mce_info
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void mce_init(void);
+#else
+static inline void mce_init(void) { };
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 9454d29ff4b4..38e0c55e845d 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -273,6 +274,9 @@ struct paca_struct {
 #ifdef CONFIG_MMIOWB
struct mmiowb_state mmiowb_state;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+   struct mce_info *mce_info;
+#endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9f3e133b57b7..feeb3231b541 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -17,22 +17,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 
-static DEFINE_PER_CPU(int, mce_nest_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
-
-/* Queue for delayed MCE events. */
-static DEFINE_PER_CPU(int, mce_queue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
-
-/* Queue for delayed MCE UE events. */
-static DEFINE_PER_CPU(int, mce_ue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
-   mce_ue_event_queue);
+#include "setup.h"
 
 static void machine_check_process_queued_event(struct irq_work *work);
 static void machine_check_ue_irq_work(struct irq_work *work);
@@ -103,8 +94,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
struct mce_error_info *mce_err,
uint64_t nip, uint64_t addr, uint64_t phys_addr)
 {
-   int index = __this_cpu_inc_return(mce_nest_count) - 1;
-   struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
+   int index = get_mce_info()->mce_nest_count++;
+   struct machine_check_event *mce = &get_mce_info()->mce_event[index];
 
/*
 * Return if we don't have enough space to log mce event.
@@ -191,7 +182,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
  */
 int get_mce_event(struct machine_check_event *mce, bool release)
 {
-   int index = __this_cpu_read(mce_nest_count) - 1;
+   int index = get_mce_info()->mce_nest_count - 1;
struct machine_check_event *mc_evt;
int ret = 0;
 
@@ -201,7 +192,7 @@ int get_mce_event(struct machine_check_event *mce, bool 
release)
 
/* Check if we have MCE info to process. */
if (index < MAX_MC_EVT) {
-   mc_evt = this_cpu_ptr(&mce_event[index]);
+  

[PATCH v4 1/2] powerpc/mce: Reduce the size of event arrays

2021-01-22 Thread Ganesh Goudar
The maximum recursive depth of MCE is 4. Considering the maximum depth
allowed, reduce the size of the event arrays to 10 from 100. This saves
us ~19kB of memory and has no fatal consequences.

Signed-off-by: Ganesh Goudar 
---
v4: This patch is a fragment of the original patch, which is
split into two.
---
 arch/powerpc/include/asm/mce.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index e6c27ae843dc..7d8b6679ec68 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -204,7 +204,7 @@ struct mce_error_info {
boolignore_event;
 };
 
-#define MAX_MC_EVT 100
+#define MAX_MC_EVT 10
 
 /* Release flags for get_mce_event() */
 #define MCE_EVENT_RELEASE  true
-- 
2.26.2



[PATCH v4 2/2] powerpc/mce: Remove per cpu variables from MCE handlers

2021-01-22 Thread Ganesh Goudar
Access to per-cpu variables requires translation to be enabled on
pseries machines running in hash MMU mode. Since part of the MCE
handler runs in real mode and part of the MCE handling code is shared
between the pseries and powernv ppc architectures, it becomes
difficult to manage these variables differently on different
architectures, so keep these variables in the paca instead of having
them as per-cpu variables, to avoid complications.

Signed-off-by: Ganesh Goudar 
---
v2: Dynamically allocate memory for machine check event info

v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid
to allocate memory.

v4: Splitting the patch into two.
---
 arch/powerpc/include/asm/mce.h | 18 +++
 arch/powerpc/include/asm/paca.h|  4 ++
 arch/powerpc/kernel/mce.c  | 79 ++
 arch/powerpc/kernel/setup-common.c |  2 +-
 4 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 7d8b6679ec68..331d944280b8 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -206,6 +206,17 @@ struct mce_error_info {
 
 #define MAX_MC_EVT 10
 
+struct mce_info {
+   int mce_nest_count;
+   struct machine_check_event mce_event[MAX_MC_EVT];
+   /* Queue for delayed MCE events. */
+   int mce_queue_count;
+   struct machine_check_event mce_event_queue[MAX_MC_EVT];
+   /* Queue for delayed MCE UE events. */
+   int mce_ue_count;
+   struct machine_check_event  mce_ue_event_queue[MAX_MC_EVT];
+};
+
 /* Release flags for get_mce_event() */
 #define MCE_EVENT_RELEASE  true
 #define MCE_EVENT_DONTRELEASE  false
@@ -234,4 +245,11 @@ long __machine_check_early_realmode_p8(struct pt_regs 
*regs);
 long __machine_check_early_realmode_p9(struct pt_regs *regs);
 long __machine_check_early_realmode_p10(struct pt_regs *regs);
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void mce_init(void);
+#else
+static inline void mce_init(void) { };
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 9454d29ff4b4..38e0c55e845d 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -273,6 +274,9 @@ struct paca_struct {
 #ifdef CONFIG_MMIOWB
struct mmiowb_state mmiowb_state;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+   struct mce_info *mce_info;
+#endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9f3e133b57b7..6ec5c68997ed 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -17,22 +17,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 
-static DEFINE_PER_CPU(int, mce_nest_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
-
-/* Queue for delayed MCE events. */
-static DEFINE_PER_CPU(int, mce_queue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
-
-/* Queue for delayed MCE UE events. */
-static DEFINE_PER_CPU(int, mce_ue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
-   mce_ue_event_queue);
+#include "setup.h"
 
 static void machine_check_process_queued_event(struct irq_work *work);
 static void machine_check_ue_irq_work(struct irq_work *work);
@@ -103,9 +94,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
struct mce_error_info *mce_err,
uint64_t nip, uint64_t addr, uint64_t phys_addr)
 {
-   int index = __this_cpu_inc_return(mce_nest_count) - 1;
-   struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
+   int index = local_paca->mce_info->mce_nest_count++;
+   struct machine_check_event *mce;
 
+   mce = &local_paca->mce_info->mce_event[index];
/*
 * Return if we don't have enough space to log mce event.
 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
@@ -191,7 +183,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
  */
 int get_mce_event(struct machine_check_event *mce, bool release)
 {
-   int index = __this_cpu_read(mce_nest_count) - 1;
+   int index = local_paca->mce_info->mce_nest_count - 1;
struct machine_check_event *mc_evt;
int ret = 0;
 
@@ -201,7 +193,7 @@ int get_mce_event(struct machine_check_event *mce, bool 
release)
 
/* Check if we have MCE info to process. */
if (index < MAX_MC_EVT) {
-   mc_evt = this_cpu_ptr(&mce_event[index]);
+   mc_evt = &local_paca->mce_info->mce_event[index];
/* Copy the event structure and release the original */

[PATCH v5 1/2] powerpc/mce: Reduce the size of event arrays

2021-01-28 Thread Ganesh Goudar
The maximum recursive depth of MCE is 4. Considering the maximum depth
allowed, reduce the size of the event arrays to 10 from 100. This saves
us ~19kB of memory and has no fatal consequences.

Signed-off-by: Ganesh Goudar 
---
v4: This patch is a fragment of the original patch, which is
split into two.

v5: No changes.
---
 arch/powerpc/include/asm/mce.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index e6c27ae843dc..7d8b6679ec68 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -204,7 +204,7 @@ struct mce_error_info {
boolignore_event;
 };
 
-#define MAX_MC_EVT 100
+#define MAX_MC_EVT 10
 
 /* Release flags for get_mce_event() */
 #define MCE_EVENT_RELEASE  true
-- 
2.26.2



[PATCH v5 2/2] powerpc/mce: Remove per cpu variables from MCE handlers

2021-01-28 Thread Ganesh Goudar
Access to per-cpu variables requires translation to be enabled on
pseries machines running in hash MMU mode. Since part of the MCE
handler runs in real mode and part of the MCE handling code is shared
between the pseries and powernv ppc architectures, it becomes
difficult to manage these variables differently on different
architectures, so keep these variables in the paca instead of having
them as per-cpu variables, to avoid complications.

Signed-off-by: Ganesh Goudar 
---
v2: Dynamically allocate memory for machine check event info.

v3: Remove check for hash mmu lpar, use memblock_alloc_try_nid
to allocate memory.

v4: Splitting the patch into two.

v5: Fix build error for PPC32.
---
 arch/powerpc/include/asm/mce.h | 18 +++
 arch/powerpc/include/asm/paca.h|  4 ++
 arch/powerpc/kernel/mce.c  | 79 ++
 arch/powerpc/kernel/setup-common.c |  2 +
 4 files changed, 71 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 7d8b6679ec68..331d944280b8 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -206,6 +206,17 @@ struct mce_error_info {
 
 #define MAX_MC_EVT 10
 
+struct mce_info {
+   int mce_nest_count;
+   struct machine_check_event mce_event[MAX_MC_EVT];
+   /* Queue for delayed MCE events. */
+   int mce_queue_count;
+   struct machine_check_event mce_event_queue[MAX_MC_EVT];
+   /* Queue for delayed MCE UE events. */
+   int mce_ue_count;
+   struct machine_check_event  mce_ue_event_queue[MAX_MC_EVT];
+};
+
 /* Release flags for get_mce_event() */
 #define MCE_EVENT_RELEASE  true
 #define MCE_EVENT_DONTRELEASE  false
@@ -234,4 +245,11 @@ long __machine_check_early_realmode_p8(struct pt_regs 
*regs);
 long __machine_check_early_realmode_p9(struct pt_regs *regs);
 long __machine_check_early_realmode_p10(struct pt_regs *regs);
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void mce_init(void);
+#else
+static inline void mce_init(void) { };
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 9454d29ff4b4..38e0c55e845d 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -273,6 +274,9 @@ struct paca_struct {
 #ifdef CONFIG_MMIOWB
struct mmiowb_state mmiowb_state;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+   struct mce_info *mce_info;
+#endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9f3e133b57b7..6ec5c68997ed 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -17,22 +17,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 
-static DEFINE_PER_CPU(int, mce_nest_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
-
-/* Queue for delayed MCE events. */
-static DEFINE_PER_CPU(int, mce_queue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
-
-/* Queue for delayed MCE UE events. */
-static DEFINE_PER_CPU(int, mce_ue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
-   mce_ue_event_queue);
+#include "setup.h"
 
 static void machine_check_process_queued_event(struct irq_work *work);
 static void machine_check_ue_irq_work(struct irq_work *work);
@@ -103,9 +94,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
struct mce_error_info *mce_err,
uint64_t nip, uint64_t addr, uint64_t phys_addr)
 {
-   int index = __this_cpu_inc_return(mce_nest_count) - 1;
-   struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
+   int index = local_paca->mce_info->mce_nest_count++;
+   struct machine_check_event *mce;
 
+   mce = &local_paca->mce_info->mce_event[index];
/*
 * Return if we don't have enough space to log mce event.
 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
@@ -191,7 +183,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
  */
 int get_mce_event(struct machine_check_event *mce, bool release)
 {
-   int index = __this_cpu_read(mce_nest_count) - 1;
+   int index = local_paca->mce_info->mce_nest_count - 1;
struct machine_check_event *mc_evt;
int ret = 0;
 
@@ -201,7 +193,7 @@ int get_mce_event(struct machine_check_event *mce, bool 
release)
 
/* Check if we have MCE info to process. */
if (index < MAX_MC_EVT) {
-   mc_evt = this_cpu_ptr(&mce_event[index]);
+   mc_evt = &local_paca->mce_info->mce_event[index];
/* Copy the even

[PATCH] powerpc/mce: Avoid using addr_to_pfn in realmode

2020-06-20 Thread Ganesh Goudar
When a UE or memory error exception is encountered, the MCE handler
tries to find the pfn using addr_to_pfn(), which takes the effective
address as an argument. The pfn is later used to poison the page where
the memory error occurred. A recent rework in this area made
addr_to_pfn() run in real mode, which can be fatal as it may try to
access memory outside the RMO region.

To fix this move the use of addr_to_pfn to save_mce_event(), which
runs in virtual mode.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c|  7 +
 arch/powerpc/kernel/mce_power.c  | 39 +++-
 arch/powerpc/platforms/pseries/ras.c | 12 ++---
 3 files changed, 18 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd90c0eda229..c5581a742367 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -148,6 +148,13 @@ void save_mce_event(struct pt_regs *regs, long handled,
} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
mce->u.ue_error.effective_address_provided = true;
mce->u.ue_error.effective_address = addr;
+   if (phys_addr == ULONG_MAX && mce->sync_error && addr) {
+   unsigned long pfn;
+
+   pfn = addr_to_pfn(regs, addr);
+   if (pfn != ULONG_MAX)
+   phys_addr = pfn << PAGE_SHIFT;
+   }
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index c3b522bff9b4..1b2582818f2b 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -361,8 +361,7 @@ static const struct mce_derror_table mce_p9_derror_table[] 
= {
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0, false, 0, 0, 0, 0, 0 } };
 
-static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr,
-   uint64_t *phys_addr)
+static int mce_find_instr_ea(struct pt_regs *regs, uint64_t *addr)
 {
/*
 * Carefully look at the NIP to determine
@@ -380,9 +379,7 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, 
uint64_t *addr,
instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
instr = ppc_inst_read((struct ppc_inst *)instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
return 0;
}
/*
@@ -398,8 +395,7 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, 
uint64_t *addr,
 
 static int mce_handle_ierror(struct pt_regs *regs,
const struct mce_ierror_table table[],
-   struct mce_error_info *mce_err, uint64_t *addr,
-   uint64_t *phys_addr)
+   struct mce_error_info *mce_err, uint64_t *addr)
 {
uint64_t srr1 = regs->msr;
int handled = 0;
@@ -455,21 +451,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
mce_err->sync_error = table[i].sync_error;
mce_err->severity = table[i].severity;
mce_err->initiator = table[i].initiator;
-   if (table[i].nip_valid) {
+   if (table[i].nip_valid)
*addr = regs->nip;
-   if (mce_err->sync_error &&
-   table[i].error_type == MCE_ERROR_TYPE_UE) {
-   unsigned long pfn;
-
-   if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
-   if (pfn != ULONG_MAX) {
-   *phys_addr =
-   (pfn << PAGE_SHIFT);
-   }
-   }
-   }
-   }
return handled;
}
 
@@ -484,8 +467,7 @@ static int mce_handle_ierror(struct pt_regs *regs,
 
 static int mce_handle_derror(struct pt_regs *regs,
const struct mce_derror_table table[],
-   struct mce_error_info *mce_err, uint64_t *addr,
-   uint64_t *phys_addr)
+   struct mce_error_info *mce_err, uint64_t *addr)
 {
uint64_t dsisr = regs->dsisr;
int handled = 0;
@@ -562,8 +544,7 @@ static int mce_handle_derror(struct pt_regs *regs,
 * kernel/exception-64s.h
 */
  

[PATCH 3/3] powerpc/mce: Modify the real address error logging messages

2021-07-30 Thread Ganesh Goudar
To avoid ambiguity, modify the strings in the real address error
logging messages to "foreign/control memory" from "foreign",
since the error descriptions in the P9 user manual and the P10
user manual are different for the same type of errors.

P9 User Manual for MCE:
DSISR:59 Host real address to foreign space during translation.
DSISR:60 Host real address to foreign space on a load or store
 access.

P10 User Manual for MCE:
DSISR:59 D-side tablewalk used a host real address in the
 control memory address range.
DSISR:60 D-side operand access to control memory address space.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 47a683cd00d2..f3ef480bb739 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -388,14 +388,14 @@ void machine_check_print_event_info(struct 
machine_check_event *evt,
static const char *mc_ra_types[] = {
"Indeterminate",
"Instruction fetch (bad)",
-   "Instruction fetch (foreign)",
+   "Instruction fetch (foreign/control memory)",
"Page table walk ifetch (bad)",
-   "Page table walk ifetch (foreign)",
+   "Page table walk ifetch (foreign/control memory)",
"Load (bad)",
"Store (bad)",
"Page table walk Load/Store (bad)",
-   "Page table walk Load/Store (foreign)",
-   "Load/Store (foreign)",
+   "Page table walk Load/Store (foreign/control memory)",
+   "Load/Store (foreign/control memory)",
};
static const char *mc_link_types[] = {
"Indeterminate",
-- 
2.31.1



[PATCH 2/3] selftests/powerpc: Add test for real address error handling

2021-07-30 Thread Ganesh Goudar
Add test for real address or control memory address access
error handling, using NX-GZIP engine.

The error is injected by accessing the control memory address
using an illegal instruction. On successful handling, the process
attempting to access the control memory address using the illegal
instruction receives SIGBUS.

Signed-off-by: Ganesh Goudar 
---
 tools/testing/selftests/powerpc/Makefile  |  3 +-
 tools/testing/selftests/powerpc/mce/Makefile  |  6 +++
 .../selftests/powerpc/mce/inject-ra-err.c | 42 +++
 .../selftests/powerpc/mce/inject-ra-err.sh| 19 +
 4 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/mce/Makefile
 create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c
 create mode 100755 tools/testing/selftests/powerpc/mce/inject-ra-err.sh

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 0830e63818c1..4830372d7416 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -31,7 +31,8 @@ SUB_DIRS = alignment  \
   vphn \
   math \
   ptrace   \
-  security
+  security \
+  mce
 
 endif
 
diff --git a/tools/testing/selftests/powerpc/mce/Makefile 
b/tools/testing/selftests/powerpc/mce/Makefile
new file mode 100644
index ..0f537ce86370
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/Makefile
@@ -0,0 +1,6 @@
+#SPDX-License-Identifier: GPL-2.0-or-later
+
+TEST_PROGS := inject-ra-err.sh
+TEST_GEN_FILES := inject-ra-err
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c 
b/tools/testing/selftests/powerpc/mce/inject-ra-err.c
new file mode 100644
index ..58374bc92e90
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+int main(void)
+{
+   int fd, ret;
+   int *paste_addr;
+   struct vas_tx_win_open_attr attr;
+   char *devname = "/dev/crypto/nx-gzip";
+
+   memset(&attr, 0, sizeof(attr));
+   attr.version = 1;
+   attr.vas_id = 0;
+
+   fd = open(devname, O_RDWR);
+   if (fd < 0) {
+   fprintf(stderr, "Failed to open device %s\n", devname);
+   return -errno;
+   }
+   ret = ioctl(fd, VAS_TX_WIN_OPEN, &attr);
+   if (ret < 0) {
+   fprintf(stderr, "ioctl() n %d, error %d\n", ret, errno);
+   ret = -errno;
+   goto out;
+   }
+   paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 
0ULL);
+   /* The following assignment triggers exception */
+   *paste_addr = 1;
+   ret = 0;
+out:
+   close(fd);
+   return ret;
+}
diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh 
b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh
new file mode 100755
index ..0e9c8ae6ad78
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+if [[ ! -w /dev/crypto/nx-gzip ]]; then
+   echo "WARN: Can't access /dev/crypto/nx-gzip, skipping"
+   exit 0
+fi
+
+# Timeout in 5 seconds, If not handled it may run indefinitely.
+timeout 5 ./inject-ra-err
+
+# 128 + 7 (SIGBUS) = 135, 128 is a exit Code With Special Meaning.
+if [ $? -ne 135 ]; then
+   echo "FAILED: Control memory access error not handled"
+   exit $?
+fi
+
+echo "OK: Control memory access error is handled"
+exit 0
-- 
2.31.1



[PATCH 1/3] powerpc/pseries: Parse control memory access error

2021-07-30 Thread Ganesh Goudar
Add support to parse and log control memory access
error for pseries.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/platforms/pseries/ras.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 167f2e1b8d39..608c35cad0c3 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -80,6 +80,7 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_TYPE_TLB  0x04
 #define MC_ERROR_TYPE_D_CACHE  0x05
 #define MC_ERROR_TYPE_I_CACHE  0x07
+#define MC_ERROR_TYPE_CTRL_MEM_ACCESS  0x08
 
 /* RTAS pseries MCE error sub types */
 #define MC_ERROR_UE_INDETERMINATE  0
@@ -103,6 +104,9 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_TLB_MULTIHIT  2
 #define MC_ERROR_TLB_INDETERMINATE 3
 
+#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK   0
+#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1
+
 static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
 {
switch (mlog->error_type) {
@@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct 
pseries_mc_errorlog *mlog)
caseMC_ERROR_TYPE_ERAT:
caseMC_ERROR_TYPE_TLB:
return (mlog->sub_err_type & 0x03);
+   caseMC_ERROR_TYPE_CTRL_MEM_ACCESS:
+   return (mlog->sub_err_type & 0x70) >> 4;
default:
return 0;
}
@@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
case MC_ERROR_TYPE_I_CACHE:
mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
break;
+   case MC_ERROR_TYPE_CTRL_MEM_ACCESS:
+   mce_err.error_type = MCE_ERROR_TYPE_RA;
+   if (mce_log->sub_err_type & 0x80)
+   eaddr = be64_to_cpu(mce_log->effective_address);
+   switch (err_sub_type) {
+   case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK:
+   mce_err.u.ra_error_type =
+   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN;
+   break;
+   case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS:
+   mce_err.u.ra_error_type =
+   MCE_RA_ERROR_LOAD_STORE_FOREIGN;
+   break;
+   }
+   break;
case MC_ERROR_TYPE_UNKNOWN:
default:
mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
-- 
2.31.1



[PATCH v2 1/3] powerpc/pseries: Parse control memory access error

2021-08-05 Thread Ganesh Goudar
Add support to parse and log control memory access
error for pseries.

Signed-off-by: Ganesh Goudar 
---
v2: No changes in this patch.
---
 arch/powerpc/platforms/pseries/ras.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 167f2e1b8d39..608c35cad0c3 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -80,6 +80,7 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_TYPE_TLB  0x04
 #define MC_ERROR_TYPE_D_CACHE  0x05
 #define MC_ERROR_TYPE_I_CACHE  0x07
+#define MC_ERROR_TYPE_CTRL_MEM_ACCESS  0x08
 
 /* RTAS pseries MCE error sub types */
 #define MC_ERROR_UE_INDETERMINATE  0
@@ -103,6 +104,9 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_TLB_MULTIHIT  2
 #define MC_ERROR_TLB_INDETERMINATE 3
 
+#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK   0
+#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1
+
 static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
 {
switch (mlog->error_type) {
@@ -112,6 +116,8 @@ static inline u8 rtas_mc_error_sub_type(const struct 
pseries_mc_errorlog *mlog)
caseMC_ERROR_TYPE_ERAT:
caseMC_ERROR_TYPE_TLB:
return (mlog->sub_err_type & 0x03);
+   caseMC_ERROR_TYPE_CTRL_MEM_ACCESS:
+   return (mlog->sub_err_type & 0x70) >> 4;
default:
return 0;
}
@@ -699,6 +705,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
case MC_ERROR_TYPE_I_CACHE:
mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
break;
+   case MC_ERROR_TYPE_CTRL_MEM_ACCESS:
+   mce_err.error_type = MCE_ERROR_TYPE_RA;
+   if (mce_log->sub_err_type & 0x80)
+   eaddr = be64_to_cpu(mce_log->effective_address);
+   switch (err_sub_type) {
+   case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK:
+   mce_err.u.ra_error_type =
+   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN;
+   break;
+   case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS:
+   mce_err.u.ra_error_type =
+   MCE_RA_ERROR_LOAD_STORE_FOREIGN;
+   break;
+   }
+   break;
case MC_ERROR_TYPE_UNKNOWN:
default:
mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
-- 
2.31.1



[PATCH v2 2/3] selftests/powerpc: Add test for real address error handling

2021-08-05 Thread Ganesh Goudar
Add test for real address or control memory address access
error handling, using NX-GZIP engine.

The error is injected by accessing the control memory address
using an illegal instruction. On successful handling, the process
attempting to access the control memory address using the illegal
instruction receives SIGBUS.

Signed-off-by: Ganesh Goudar 
---
v2: Fix build error.
---
 tools/testing/selftests/powerpc/Makefile  |  3 +-
 tools/testing/selftests/powerpc/mce/Makefile  |  6 +++
 .../selftests/powerpc/mce/inject-ra-err.c | 42 +++
 .../selftests/powerpc/mce/inject-ra-err.sh| 18 
 tools/testing/selftests/powerpc/mce/vas-api.h |  1 +
 5 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/mce/Makefile
 create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c
 create mode 100755 tools/testing/selftests/powerpc/mce/inject-ra-err.sh
 create mode 12 tools/testing/selftests/powerpc/mce/vas-api.h

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 0830e63818c1..4830372d7416 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -31,7 +31,8 @@ SUB_DIRS = alignment  \
   vphn \
   math \
   ptrace   \
-  security
+  security \
+  mce
 
 endif
 
diff --git a/tools/testing/selftests/powerpc/mce/Makefile 
b/tools/testing/selftests/powerpc/mce/Makefile
new file mode 100644
index ..0f537ce86370
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/Makefile
@@ -0,0 +1,6 @@
+#SPDX-License-Identifier: GPL-2.0-or-later
+
+TEST_PROGS := inject-ra-err.sh
+TEST_GEN_FILES := inject-ra-err
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c 
b/tools/testing/selftests/powerpc/mce/inject-ra-err.c
new file mode 100644
index ..05ab11cec3da
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "vas-api.h"
+
+int main(void)
+{
+   int fd, ret;
+   int *paste_addr;
+   struct vas_tx_win_open_attr attr;
+   char *devname = "/dev/crypto/nx-gzip";
+
+   memset(&attr, 0, sizeof(attr));
+   attr.version = 1;
+   attr.vas_id = 0;
+
+   fd = open(devname, O_RDWR);
+   if (fd < 0) {
+   fprintf(stderr, "Failed to open device %s\n", devname);
+   return -errno;
+   }
+   ret = ioctl(fd, VAS_TX_WIN_OPEN, &attr);
+   if (ret < 0) {
+   fprintf(stderr, "ioctl() n %d, error %d\n", ret, errno);
+   ret = -errno;
+   goto out;
+   }
+   paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 
0ULL);
+   /* The following assignment triggers exception */
+   *paste_addr = 1;
+   ret = 0;
+out:
+   close(fd);
+   return ret;
+}
diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.sh 
b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh
new file mode 100755
index ..3633cdc651a1
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+if [[ ! -w /dev/crypto/nx-gzip ]]; then
+   echo "WARN: Can't access /dev/crypto/nx-gzip, skipping"
+   exit 0
+fi
+
+timeout 5 ./inject-ra-err
+
+# 128 + 7 (SIGBUS) = 135, 128 is a exit code with special meaning.
+if [ $? -ne 135 ]; then
+   echo "FAILED: Real address or Control memory access error not handled"
+   exit $?
+fi
+
+echo "OK: Real address or Control memory access error is handled"
+exit 0
diff --git a/tools/testing/selftests/powerpc/mce/vas-api.h 
b/tools/testing/selftests/powerpc/mce/vas-api.h
new file mode 12
index ..1455c1bcd351
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/vas-api.h
@@ -0,0 +1 @@
+../../../../../arch/powerpc/include/uapi/asm/vas-api.h
\ No newline at end of file
-- 
2.31.1



[PATCH v2 3/3] powerpc/mce: Modify the real address error logging messages

2021-08-05 Thread Ganesh Goudar
To avoid ambiguity, modify the strings in real address error
logging messages to "foreign/control memory" from "foreign",
since the error descriptions in the P9 user manual and the P10
user manual are different for the same type of errors.

P9 User Manual for MCE:
DSISR:59 Host real address to foreign space during translation.
DSISR:60 Host real address to foreign space on a load or store
 access.

P10 User Manual for MCE:
DSISR:59 D-side tablewalk used a host real address in the
 control memory address range.
DSISR:60 D-side operand access to control memory address space.

Signed-off-by: Ganesh Goudar 
---
v2: No changes in this patch.
---
 arch/powerpc/kernel/mce.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 47a683cd00d2..f3ef480bb739 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -388,14 +388,14 @@ void machine_check_print_event_info(struct 
machine_check_event *evt,
static const char *mc_ra_types[] = {
"Indeterminate",
"Instruction fetch (bad)",
-   "Instruction fetch (foreign)",
+   "Instruction fetch (foreign/control memory)",
"Page table walk ifetch (bad)",
-   "Page table walk ifetch (foreign)",
+   "Page table walk ifetch (foreign/control memory)",
"Load (bad)",
"Store (bad)",
"Page table walk Load/Store (bad)",
-   "Page table walk Load/Store (foreign)",
-   "Load/Store (foreign)",
+   "Page table walk Load/Store (foreign/control memory)",
+   "Load/Store (foreign/control memory)",
};
static const char *mc_link_types[] = {
"Indeterminate",
-- 
2.31.1



[PATCH] powerpc/mce: check if event info is valid

2021-08-06 Thread Ganesh Goudar
Check if the event info is valid before printing the
event information. When a fwnmi-enabled nested kvm guest
hits a machine check exception, L0 and L2 would generate
machine check event info, but L1 would not generate any
machine check event info, as it won't go through the 0x200
vector, and so it prints an unwanted message.

To fix this, 'in_use' variable in machine check event info is
no more in use, rename it to 'valid' and check if the event
information is valid before logging the event information.

without this patch L1 would print following message for
exceptions encountered in L2, as event structure will be
empty in L1.

"Machine Check Exception, Unknown event version 0".

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/mce.h | 2 +-
 arch/powerpc/kernel/mce.c  | 7 +--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 331d944280b8..3646f53f228f 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -113,7 +113,7 @@ enum MCE_LinkErrorType {
 
 struct machine_check_event {
enum MCE_Versionversion:8;
-   u8  in_use;
+   u8  valid;
enum MCE_Severity   severity:8;
enum MCE_Initiator  initiator:8;
enum MCE_ErrorType  error_type:8;
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 47a683cd00d2..b778394a06b5 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -114,7 +114,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->srr0 = nip;
mce->srr1 = regs->msr;
mce->gpr3 = regs->gpr[3];
-   mce->in_use = 1;
+   mce->valid = 1;
mce->cpu = get_paca()->paca_index;
 
/* Mark it recovered if we have handled it and MSR(RI=1). */
@@ -202,7 +202,7 @@ int get_mce_event(struct machine_check_event *mce, bool 
release)
if (mce)
*mce = *mc_evt;
if (release)
-   mc_evt->in_use = 0;
+   mc_evt->valid = 0;
ret = 1;
}
/* Decrement the count to free the slot. */
@@ -413,6 +413,9 @@ void machine_check_print_event_info(struct 
machine_check_event *evt,
"Probable Software error (some chance of hardware cause)",
};
 
+   if (!evt->valid)
+   return;
+
/* Print things out */
if (evt->version != MCE_V1) {
pr_err("Machine Check Exception, Unknown event version %d !\n",
-- 
2.31.1



[PATCH v3 RESEND 2/3] selftests/powerpc: Add test for real address error handling

2022-01-07 Thread Ganesh Goudar
Add a test for real address or control memory address access
error handling, using the NX-GZIP engine.

The error is injected by accessing the control memory address
using an illegal instruction; on successful handling, the process
attempting to access the control memory address using the illegal
instruction receives SIGBUS.

Signed-off-by: Ganesh Goudar 
---
 tools/testing/selftests/powerpc/Makefile  |  3 +-
 tools/testing/selftests/powerpc/mce/Makefile  |  7 ++
 .../selftests/powerpc/mce/inject-ra-err.c | 65 +++
 tools/testing/selftests/powerpc/mce/vas-api.h |  1 +
 4 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/mce/Makefile
 create mode 100644 tools/testing/selftests/powerpc/mce/inject-ra-err.c
 create mode 12 tools/testing/selftests/powerpc/mce/vas-api.h

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 0830e63818c1..4830372d7416 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -31,7 +31,8 @@ SUB_DIRS = alignment  \
   vphn \
   math \
   ptrace   \
-  security
+  security \
+  mce
 
 endif
 
diff --git a/tools/testing/selftests/powerpc/mce/Makefile 
b/tools/testing/selftests/powerpc/mce/Makefile
new file mode 100644
index ..2424513982d9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/Makefile
@@ -0,0 +1,7 @@
+#SPDX-License-Identifier: GPL-2.0-or-later
+
+TEST_GEN_PROGS := inject-ra-err
+
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c
diff --git a/tools/testing/selftests/powerpc/mce/inject-ra-err.c 
b/tools/testing/selftests/powerpc/mce/inject-ra-err.c
new file mode 100644
index ..94323c34d9a6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/inject-ra-err.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vas-api.h"
+#include "utils.h"
+
+static bool faulted;
+
+static void sigbus_handler(int n, siginfo_t *info, void *ctxt_v)
+{
+   ucontext_t *ctxt = (ucontext_t *)ctxt_v;
+   struct pt_regs *regs = ctxt->uc_mcontext.regs;
+
+   faulted = true;
+   regs->nip += 4;
+}
+
+static int test_ra_error(void)
+{
+   struct vas_tx_win_open_attr attr;
+   int fd, *paste_addr;
+   char *devname = "/dev/crypto/nx-gzip";
+   struct sigaction act = {
+   .sa_sigaction = sigbus_handler,
+   .sa_flags = SA_SIGINFO,
+   };
+
+   memset(&attr, 0, sizeof(attr));
+   attr.version = 1;
+   attr.vas_id = 0;
+
+   SKIP_IF(access(devname, F_OK));
+
+   fd = open(devname, O_RDWR);
+   FAIL_IF(fd < 0);
+   FAIL_IF(ioctl(fd, VAS_TX_WIN_OPEN, &attr) < 0);
+   FAIL_IF(sigaction(SIGBUS, &act, NULL) != 0);
+
+   paste_addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 
0ULL);
+
+   /* The following assignment triggers exception */
+   mb();
+   *paste_addr = 1;
+   mb();
+
+   FAIL_IF(!faulted);
+
+   return 0;
+}
+
+int main(void)
+{
+   return test_harness(test_ra_error, "inject-ra-err");
+}
+
diff --git a/tools/testing/selftests/powerpc/mce/vas-api.h 
b/tools/testing/selftests/powerpc/mce/vas-api.h
new file mode 12
index ..1455c1bcd351
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mce/vas-api.h
@@ -0,0 +1 @@
+../../../../../arch/powerpc/include/uapi/asm/vas-api.h
\ No newline at end of file
-- 
2.31.1



[PATCH v3 RESEND 1/3] powerpc/pseries: Parse control memory access error

2022-01-07 Thread Ganesh Goudar
Add support to parse and log control memory access
error for pseries. These changes are made according to
PAPR v2.11 10.3.2.2.12.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/platforms/pseries/ras.c | 36 
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 56092dccfdb8..e62a0ca2611a 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -60,11 +60,17 @@ struct pseries_mc_errorlog {
 *  XX  2: Reserved.
 *XXX   3: Type of UE error.
 *
-* For error_type != MC_ERROR_TYPE_UE
+* For error_type == MC_ERROR_TYPE_SLB/ERAT/TLB
 *   
 *   X  1: Effective address provided.
 *X 5: Reserved.
 * XX   2: Type of SLB/ERAT/TLB error.
+*
+* For error_type == MC_ERROR_TYPE_CTRL_MEM_ACCESS
+*   
+*   X  1: Error causing address provided.
+*XXX   3: Type of error.
+*      4: Reserved.
 */
u8  sub_err_type;
u8  reserved_1[6];
@@ -80,6 +86,7 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_TYPE_TLB  0x04
 #define MC_ERROR_TYPE_D_CACHE  0x05
 #define MC_ERROR_TYPE_I_CACHE  0x07
+#define MC_ERROR_TYPE_CTRL_MEM_ACCESS  0x08
 
 /* RTAS pseries MCE error sub types */
 #define MC_ERROR_UE_INDETERMINATE  0
@@ -90,6 +97,7 @@ struct pseries_mc_errorlog {
 
 #define UE_EFFECTIVE_ADDR_PROVIDED 0x40
 #define UE_LOGICAL_ADDR_PROVIDED   0x20
+#define MC_EFFECTIVE_ADDR_PROVIDED 0x80
 
 #define MC_ERROR_SLB_PARITY0
 #define MC_ERROR_SLB_MULTIHIT  1
@@ -103,6 +111,9 @@ struct pseries_mc_errorlog {
 #define MC_ERROR_TLB_MULTIHIT  2
 #define MC_ERROR_TLB_INDETERMINATE 3
 
+#define MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK   0
+#define MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS 1
+
 static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog)
 {
switch (mlog->error_type) {
@@ -112,6 +123,8 @@ static inline u8 rtas_mc_error_sub_type(const struct 
pseries_mc_errorlog *mlog)
caseMC_ERROR_TYPE_ERAT:
caseMC_ERROR_TYPE_TLB:
return (mlog->sub_err_type & 0x03);
+   caseMC_ERROR_TYPE_CTRL_MEM_ACCESS:
+   return (mlog->sub_err_type & 0x70) >> 4;
default:
return 0;
}
@@ -656,7 +669,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
break;
}
-   if (mce_log->sub_err_type & 0x80)
+   if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
eaddr = be64_to_cpu(mce_log->effective_address);
break;
case MC_ERROR_TYPE_ERAT:
@@ -673,7 +686,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.u.erat_error_type = 
MCE_ERAT_ERROR_INDETERMINATE;
break;
}
-   if (mce_log->sub_err_type & 0x80)
+   if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
eaddr = be64_to_cpu(mce_log->effective_address);
break;
case MC_ERROR_TYPE_TLB:
@@ -690,7 +703,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE;
break;
}
-   if (mce_log->sub_err_type & 0x80)
+   if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
eaddr = be64_to_cpu(mce_log->effective_address);
break;
case MC_ERROR_TYPE_D_CACHE:
@@ -699,6 +712,21 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
case MC_ERROR_TYPE_I_CACHE:
mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
break;
+   case MC_ERROR_TYPE_CTRL_MEM_ACCESS:
+   mce_err.error_type = MCE_ERROR_TYPE_RA;
+   switch (err_sub_type) {
+   case MC_ERROR_CTRL_MEM_ACCESS_PTABLE_WALK:
+   mce_err.u.ra_error_type =
+   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN;
+   break;
+   case MC_ERROR_CTRL_MEM_ACCESS_OP_ACCESS:
+   mce_err.u.ra_error_type =
+   MCE_RA_ERROR_LOAD_STORE_FOREIGN;
+   break;
+   }
+   if (mce_log->sub_err_type & MC_EFFECTIVE_ADDR_PROVIDED)
+   eaddr = be64_to_cpu(mce_log->effective_address);
+   break;
   

[PATCH v3 RESEND 3/3] powerpc/mce: Modify the real address error logging messages

2022-01-07 Thread Ganesh Goudar
To avoid ambiguity, modify the strings in real address error
logging messages to "foreign/control memory" from "foreign",
since the error descriptions in the P9 user manual and the P10
user manual are different for the same type of errors.

P9 User Manual for MCE:
DSISR:59 Host real address to foreign space during translation.
DSISR:60 Host real address to foreign space on a load or store
 access.

P10 User Manual for MCE:
DSISR:59 D-side tablewalk used a host real address in the
 control memory address range.
DSISR:60 D-side operand access to control memory address space.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd829f7f25a4..55ccc651d1b0 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -401,14 +401,14 @@ void machine_check_print_event_info(struct 
machine_check_event *evt,
static const char *mc_ra_types[] = {
"Indeterminate",
"Instruction fetch (bad)",
-   "Instruction fetch (foreign)",
+   "Instruction fetch (foreign/control memory)",
"Page table walk ifetch (bad)",
-   "Page table walk ifetch (foreign)",
+   "Page table walk ifetch (foreign/control memory)",
"Load (bad)",
"Store (bad)",
"Page table walk Load/Store (bad)",
-   "Page table walk Load/Store (foreign)",
-   "Load/Store (foreign)",
+   "Page table walk Load/Store (foreign/control memory)",
+   "Load/Store (foreign/control memory)",
};
static const char *mc_link_types[] = {
"Indeterminate",
-- 
2.31.1



[PATCH v4] powerpc/mce: Avoid using irq_work_queue() in realmode

2022-01-17 Thread Ganesh Goudar
In realmode mce handler we use irq_work_queue() to defer
the processing of mce events, irq_work_queue() can only
be called when translation is enabled because it touches
memory outside RMA, hence we enable translation before
calling irq_work_queue and disable on return, though it
is not safe to do in realmode.

To avoid this, program the decrementer and call the event
processing functions from timer handler.

Signed-off-by: Ganesh Goudar 
---
V2:
* Use arch_irq_work_raise to raise decrementer interrupt.
* Avoid having atomic variable.

V3:
* Fix build error.
  Reported by kernel test bot.

V4:
* Rename some functions and variables
* Remove mces_to_process counter and add a flag to indicate
  there is a mce info to process.
---
 arch/powerpc/include/asm/machdep.h   |  2 +
 arch/powerpc/include/asm/mce.h   | 13 +
 arch/powerpc/include/asm/paca.h  |  1 +
 arch/powerpc/kernel/mce.c| 60 +---
 arch/powerpc/kernel/time.c   |  2 +
 arch/powerpc/platforms/pseries/pseries.h |  1 +
 arch/powerpc/platforms/pseries/ras.c | 31 +---
 arch/powerpc/platforms/pseries/setup.c   |  1 +
 8 files changed, 53 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index 9c3c9f04129f..d22b222ba471 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -99,6 +99,8 @@ struct machdep_calls {
/* Called during machine check exception to retrive fixup address. */
bool(*mce_check_early_recovery)(struct pt_regs *regs);
 
+   void(*machine_check_log_err)(void);
+
/* Motherboard/chipset features. This is a kind of general purpose
 * hook used to control some machine specific features (like reset
 * lines, chip power control, etc...).
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 331d944280b8..c9f0936bd3c9 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -235,8 +235,21 @@ extern void machine_check_print_event_info(struct 
machine_check_event *evt,
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
 extern void mce_common_process_ue(struct pt_regs *regs,
  struct mce_error_info *mce_err);
+void mce_irq_work_queue(void);
 int mce_register_notifier(struct notifier_block *nb);
 int mce_unregister_notifier(struct notifier_block *nb);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void mce_run_irq_context_handlers(void);
+#else
+static inline void mce_run_irq_context_handlers(void) { };
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void set_mce_pending_irq_work(void);
+void clear_mce_pending_irq_work(void);
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 void flush_erat(void);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index dc05a862e72a..963030689cfa 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -280,6 +280,7 @@ struct paca_struct {
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
struct mce_info *mce_info;
+   u8 mce_pending_irq_work;
 #endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd829f7f25a4..6af798803ece 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -28,19 +28,9 @@
 
 #include "setup.h"
 
-static void machine_check_process_queued_event(struct irq_work *work);
-static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
-static struct irq_work mce_event_process_work = {
-.func = machine_check_process_queued_event,
-};
-
-static struct irq_work mce_ue_event_irq_work = {
-   .func = machine_check_ue_irq_work,
-};
-
 static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
 static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
@@ -89,6 +79,13 @@ static void mce_set_error_info(struct machine_check_event 
*mce,
}
 }
 
+void mce_irq_work_queue(void)
+{
+   /* Raise decrementer interrupt */
+   arch_irq_work_raise();
+   set_mce_pending_irq_work();
+}
+
 /*
  * Decode and save high level MCE information into per cpu buffer which
  * is an array of machine_check_event structure.
@@ -217,7 +214,7 @@ void release_mce_event(void)
get_mce_event(NULL, true);
 }
 
-static void machine_check_ue_irq_work(struct irq_work *work)
+static void machine_check_ue_work(void)
 {
schedule_work(&mce_ue_event_work);
 }
@@ -239,7 +236,7 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
   evt, sizeof(*evt));
 
/* Queue work to process this event later. */
-   irq_work_queue(&mce

[PATCH v5] powerpc/mce: Avoid using irq_work_queue() in realmode

2022-01-20 Thread Ganesh Goudar
In realmode mce handler we use irq_work_queue() to defer
the processing of mce events, irq_work_queue() can only
be called when translation is enabled because it touches
memory outside RMA, hence we enable translation before
calling irq_work_queue and disable on return, though it
is not safe to do in realmode.

To avoid this, program the decrementer and call the event
processing functions from timer handler.

Signed-off-by: Ganesh Goudar 
---
V2:
* Use arch_irq_work_raise to raise decrementer interrupt.
* Avoid having atomic variable.

V3:
* Fix build error.
  Reported by kernel test bot.

V4:
* Rename some functions and variables
* Remove mces_to_process counter and add a flag to indicate
  there is a mce info to process.

V5:
* Fix the build warning, reported by kernel test robot.
---
 arch/powerpc/include/asm/machdep.h   |  2 +
 arch/powerpc/include/asm/mce.h   | 13 +
 arch/powerpc/include/asm/paca.h  |  1 +
 arch/powerpc/kernel/mce.c| 60 +---
 arch/powerpc/kernel/time.c   |  2 +
 arch/powerpc/platforms/pseries/pseries.h |  1 +
 arch/powerpc/platforms/pseries/ras.c | 32 +
 arch/powerpc/platforms/pseries/setup.c   |  1 +
 8 files changed, 53 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index e821037f74f0..36d2f34aa352 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -99,6 +99,8 @@ struct machdep_calls {
/* Called during machine check exception to retrive fixup address. */
bool(*mce_check_early_recovery)(struct pt_regs *regs);
 
+   void(*machine_check_log_err)(void);
+
/* Motherboard/chipset features. This is a kind of general purpose
 * hook used to control some machine specific features (like reset
 * lines, chip power control, etc...).
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 331d944280b8..c9f0936bd3c9 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -235,8 +235,21 @@ extern void machine_check_print_event_info(struct 
machine_check_event *evt,
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
 extern void mce_common_process_ue(struct pt_regs *regs,
  struct mce_error_info *mce_err);
+void mce_irq_work_queue(void);
 int mce_register_notifier(struct notifier_block *nb);
 int mce_unregister_notifier(struct notifier_block *nb);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void mce_run_irq_context_handlers(void);
+#else
+static inline void mce_run_irq_context_handlers(void) { };
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void set_mce_pending_irq_work(void);
+void clear_mce_pending_irq_work(void);
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 void flush_erat(void);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 295573a82c66..8330968ca346 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -288,6 +288,7 @@ struct paca_struct {
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
struct mce_info *mce_info;
+   u8 mce_pending_irq_work;
 #endif /* CONFIG_PPC_BOOK3S_64 */
 } cacheline_aligned;
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 2503dd4713b9..6cd4b1409874 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -28,19 +28,9 @@
 
 #include "setup.h"
 
-static void machine_check_process_queued_event(struct irq_work *work);
-static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
-static struct irq_work mce_event_process_work = {
-.func = machine_check_process_queued_event,
-};
-
-static struct irq_work mce_ue_event_irq_work = {
-   .func = machine_check_ue_irq_work,
-};
-
 static DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
 static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
@@ -89,6 +79,13 @@ static void mce_set_error_info(struct machine_check_event 
*mce,
}
 }
 
+void mce_irq_work_queue(void)
+{
+   /* Raise decrementer interrupt */
+   arch_irq_work_raise();
+   set_mce_pending_irq_work();
+}
+
 /*
  * Decode and save high level MCE information into per cpu buffer which
  * is an array of machine_check_event structure.
@@ -217,7 +214,7 @@ void release_mce_event(void)
get_mce_event(NULL, true);
 }
 
-static void machine_check_ue_irq_work(struct irq_work *work)
+static void machine_check_ue_work(void)
 {
schedule_work(&mce_ue_event_work);
 }
@@ -239,7 +236,7 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
   evt, sizeof(*evt));
 
/* Queue work to process th

[PATCH] powerpc/eeh: avoid possible crash when edev->pdev changes

2024-05-27 Thread Ganesh Goudar
If a PCI device is removed during eeh_pe_report_edev(), edev->pdev
will change and can cause a crash, hold the PCI rescan/remove lock
while taking a copy of edev->pdev.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/eeh_pe.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index d1030bc52564..49f968733912 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -859,7 +859,9 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 
/* Retrieve the parent PCI bus of first (top) PCI device */
edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry);
+   pci_lock_rescan_remove();
pdev = eeh_dev_to_pci_dev(edev);
+   pci_unlock_rescan_remove();
if (pdev)
return pdev->bus;
 
-- 
2.44.0



[PATCH v2] powerpc/eeh: avoid possible crash when edev->pdev changes

2024-06-13 Thread Ganesh Goudar
If a PCI device is removed during eeh_pe_report_edev(), edev->pdev
will change and can cause a crash, hold the PCI rescan/remove lock
while taking a copy of edev->pdev.

Signed-off-by: Ganesh Goudar 
---
v2: Hold rescan lock till we get the bus address.
---
 arch/powerpc/kernel/eeh_pe.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index d1030bc52564..d283d281d28e 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -849,6 +849,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 {
struct eeh_dev *edev;
struct pci_dev *pdev;
+   struct pci_bus *bus = NULL;
 
if (pe->type & EEH_PE_PHB)
return pe->phb->bus;
@@ -859,9 +860,11 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 
/* Retrieve the parent PCI bus of first (top) PCI device */
edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry);
+   pci_lock_rescan_remove();
pdev = eeh_dev_to_pci_dev(edev);
if (pdev)
-   return pdev->bus;
+   bus = pdev->bus;
+   pci_unlock_rescan_remove();
 
-   return NULL;
+   return bus;
 }
-- 
2.44.0



[PATCH v2 1/1] powerpc/eeh: Enable PHBs to recovery in parallel

2024-06-17 Thread Ganesh Goudar
Currently, with a single event queue, EEH recovery is entirely
serialized and takes place within a single kernel thread. This
can cause recovery to take a long time when there are many
devices.

Have the recovery event queue per PHB and allow the recovery to
happen in parallel for all the PHBs.

Signed-off-by: Ganesh Goudar 
---
v2: Include missing hunk, which modifies __eeh_send_failure_event.
---
 arch/powerpc/include/asm/eeh_event.h  |  7 
 arch/powerpc/include/asm/pci-bridge.h |  4 ++
 arch/powerpc/kernel/eeh_driver.c  | 27 +++-
 arch/powerpc/kernel/eeh_event.c   | 59 +--
 arch/powerpc/kernel/eeh_pe.c  |  4 ++
 5 files changed, 78 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh_event.h 
b/arch/powerpc/include/asm/eeh_event.h
index dadde7d52f46..6af1b5bb6103 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -8,6 +8,8 @@
 #define ASM_POWERPC_EEH_EVENT_H
 #ifdef __KERNEL__
 
+#include 
+
 /*
  * structure holding pci controller data that describes a
  * change in the isolation status of a PCI slot.  A pointer
@@ -15,15 +17,20 @@
  * callback.
  */
 struct eeh_event {
+   struct work_struct  work;
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
 };
 
+extern spinlock_t eeh_eventlist_lock;
+
 int eeh_event_init(void);
+int eeh_phb_event(struct eeh_pe *pe);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
 void eeh_handle_normal_event(struct eeh_pe *pe);
+void eeh_handle_normal_event_work(struct work_struct *work);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 2aa3a091ef20..61884d9398bf 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -138,6 +138,10 @@ struct pci_controller {
 
/* iommu_ops support */
struct iommu_device iommu;
+
+   bool eeh_in_progress;
+   struct list_head eeh_eventlist;
+   spinlock_t eeh_eventlist_lock;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7efe04c68f0f..4cf5fd409369 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -1116,6 +1116,30 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 }
 
+void eeh_handle_normal_event_work(struct work_struct *work)
+{
+   unsigned long flags;
+   struct eeh_event *event = container_of(work, struct eeh_event, work);
+   struct pci_controller *phb = event->pe->phb;
+
+   eeh_handle_normal_event(event->pe);
+
+   kfree(event);
+   spin_lock_irqsave(&phb->eeh_eventlist_lock, flags);
+   WARN_ON_ONCE(!phb->eeh_in_progress);
+   if (list_empty(&phb->eeh_eventlist)) {
+   phb->eeh_in_progress = false;
+   pr_debug("EEH: No more work to do\n");
+   } else {
+   pr_warn("EEH: More work to do\n");
+   event = list_entry(phb->eeh_eventlist.next,
+  struct eeh_event, list);
+   list_del(&event->list);
+   queue_work(system_unbound_wq, &event->work);
+   }
+   spin_unlock_irqrestore(&phb->eeh_eventlist_lock, flags);
+}
+
 /**
  * eeh_handle_special_event - Handle EEH events without a specific failing PE
  *
@@ -1185,8 +1209,7 @@ void eeh_handle_special_event(void)
 */
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
rc == EEH_NEXT_ERR_FENCED_PHB) {
-   eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
-   eeh_handle_normal_event(pe);
+   eeh_phb_event(pe);
} else {
eeh_for_each_pe(pe, tmp_pe)
eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index c23a454af08a..8a9d6358d39f 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -22,7 +22,7 @@
  *  work-queue, where a worker thread can drive recovery.
  */
 
-static DEFINE_SPINLOCK(eeh_eventlist_lock);
+DEFINE_SPINLOCK(eeh_eventlist_lock);
 static DECLARE_COMPLETION(eeh_eventlist_event);
 static LIST_HEAD(eeh_eventlist);
 
@@ -91,6 +91,42 @@ int eeh_event_init(void)
return 0;
 }
 
+int eeh_phb_event(struct eeh_pe *pe)
+{
+   struct eeh_event *event;
+   unsigned long flags;
+   struct pci_controller *phb;
+
+   event = kzalloc(sizeof(*event), GFP_ATOMIC);
+   if (!event)
+   

[PATCH v2 0/1] Parallel EEH recovery between PHBs

2024-06-17 Thread Ganesh Goudar
This change is based on Sam Bobroff's patches which aimed
to allow recovery to happen in parallel between PHBs and PEs,
Due to various reasons the patches did not get in.

But having parallel recovery between PHBs is fairly simple and
gives a significant improvement on powervm, since powervm maintains
a flat hierarchy for PCI devices.
This patch enables PHBs to have separate event queues and shorten
the time taken for EEH recovery by making the recovery to run in
parallel between PHBs.

On powervm with 64 VFs from the same PHB, I see approximately a 48%
reduction in the time taken for EEH recovery. On powernv the
improvement is not so significant.

Ganesh Goudar (1):
  powerpc/eeh: Enable PHBs to recovery in parallel

 arch/powerpc/include/asm/eeh_event.h  |  7 
 arch/powerpc/include/asm/pci-bridge.h |  4 ++
 arch/powerpc/kernel/eeh_driver.c  | 27 +++-
 arch/powerpc/kernel/eeh_event.c   | 59 +--
 arch/powerpc/kernel/eeh_pe.c  |  4 ++
 5 files changed, 78 insertions(+), 23 deletions(-)

-- 
2.44.0



[PATCH v3] powerpc/eeh: avoid possible crash when edev->pdev changes

2024-06-17 Thread Ganesh Goudar
If a PCI device is removed during eeh_pe_report_edev(), edev->pdev
will change and can cause a crash, hold the PCI rescan/remove lock
while taking a copy of edev->pdev->bus.

Signed-off-by: Ganesh Goudar 
---
v2: Hold rescan lock till we get the bus address.
v3: Now that we are taking copy of bus, holding the lock, update the
commit message accordingly.
---
 arch/powerpc/kernel/eeh_pe.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index d1030bc52564..d283d281d28e 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -849,6 +849,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 {
struct eeh_dev *edev;
struct pci_dev *pdev;
+   struct pci_bus *bus = NULL;
 
if (pe->type & EEH_PE_PHB)
return pe->phb->bus;
@@ -859,9 +860,11 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 
/* Retrieve the parent PCI bus of first (top) PCI device */
edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry);
+   pci_lock_rescan_remove();
pdev = eeh_dev_to_pci_dev(edev);
if (pdev)
-   return pdev->bus;
+   bus = pdev->bus;
+   pci_unlock_rescan_remove();
 
-   return NULL;
+   return bus;
 }
-- 
2.44.0



[RFC 2/3] powerpc/eeh: Provide a unique ID for each EEH recovery

2022-08-15 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

Give a unique ID to each recovery event, to ease log parsing
and prepare for parallel recovery.

Also add some new messages with a very simple format that may
be useful to log-parsers.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh_event.h |   3 +-
 arch/powerpc/include/asm/ppc-pci.h   |   2 +-
 arch/powerpc/kernel/eeh.c|  42 +++---
 arch/powerpc/kernel/eeh_driver.c | 188 ---
 arch/powerpc/kernel/eeh_event.c  |  12 +-
 5 files changed, 146 insertions(+), 101 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh_event.h 
b/arch/powerpc/include/asm/eeh_event.h
index dadde7d52f46..a1fe736bc4cf 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -17,13 +17,14 @@
 struct eeh_event {
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
+   unsigned intid; /* Event ID */
 };
 
 int eeh_event_init(void);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
-void eeh_handle_normal_event(struct eeh_pe *pe);
+void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/ppc-pci.h 
b/arch/powerpc/include/asm/ppc-pci.h
index f6cf0159024e..42d175af33cb 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -40,7 +40,7 @@ extern int rtas_setup_phb(struct pci_controller *phb);
 void eeh_addr_cache_insert_dev(struct pci_dev *dev);
 void eeh_addr_cache_rmv_dev(struct pci_dev *dev);
 struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr);
-void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
+void eeh_slot_error_detail(unsigned int event_id, struct eeh_pe *pe, int 
severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
 int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed);
 void eeh_save_bars(struct eeh_dev *edev);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 2c90c37524ed..148d5df0e606 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -200,7 +200,8 @@ EXPORT_SYMBOL_GPL(eeh_recovery_must_be_locked);
  * for the indicated PCI device, and puts them into a buffer
  * for RTAS error logging.
  */
-static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
+static size_t eeh_dump_dev_log(unsigned int event_id, struct eeh_dev *edev,
+  char *buf, size_t len)
 {
u32 cfg;
int cap, i;
@@ -210,27 +211,29 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
edev->pe->phb->global_number, edev->bdfn >> 8,
PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
-   pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
+   pr_warn("EEH(%u): of node=%04x:%02x:%02x.%01x\n",
+   event_id,
edev->pe->phb->global_number, edev->bdfn >> 8,
PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
 
eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, &cfg);
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
-   pr_warn("EEH: PCI device/vendor: %08x\n", cfg);
+   pr_warn("EEH(%u): PCI device/vendor: %08x\n",event_id, cfg);
 
eeh_ops->read_config(edev, PCI_COMMAND, 4, &cfg);
n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
-   pr_warn("EEH: PCI cmd/status register: %08x\n", cfg);
+   pr_warn("EEH(%u): PCI cmd/status register: %08x\n", event_id, cfg);
 
/* Gather bridge-specific registers */
if (edev->mode & EEH_DEV_BRIDGE) {
eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, &cfg);
n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
-   pr_warn("EEH: Bridge secondary status: %04x\n", cfg);
+   pr_warn("EEH(%u): Bridge secondary status: %04x\n",
+   event_id, cfg);
 
eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &cfg);
n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
-   pr_warn("EEH: Bridge control: %04x\n", cfg);
+   pr_warn("EEH(%u): Bridge control: %04x\n", event_id, cfg);
}
 
/* Dump out the PCI-X command and status regs */
@@ -238,18 +241,19 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
if (cap) {
eeh_ops->read_config(edev, cap, 4, &cfg);
  

[RFC 0/3] Asynchronous EEH recovery

2022-08-15 Thread Ganesh Goudar
Hi,

EEH recovery is currently serialized, and these patches shorten
the time taken for EEH recovery by making the recovery run
in parallel. The original author of these patches is Sam Bobroff;
I have rebased and tested these patches.

On powervm with 64 VFs, I see approximately 48% reduction in the
time taken for EEH recovery. Yet to be tested on powernv.

These patches were originally posted as separate RFCs; I think
posting them as a single series would be more helpful. I know the
patches are too big; I will try to divide them logically in the
next iterations.

Thanks 

Ganesh Goudar (3):
  powerpc/eeh: Synchronization for safety
  powerpc/eeh: Provide a unique ID for each EEH recovery
  powerpc/eeh: Asynchronous recovery

 arch/powerpc/include/asm/eeh.h   |   7 +-
 arch/powerpc/include/asm/eeh_event.h |  10 +-
 arch/powerpc/include/asm/pci-bridge.h|   3 +
 arch/powerpc/include/asm/ppc-pci.h   |   2 +-
 arch/powerpc/kernel/eeh.c| 154 +++--
 arch/powerpc/kernel/eeh_driver.c | 578 +++
 arch/powerpc/kernel/eeh_event.c  |  71 ++-
 arch/powerpc/kernel/eeh_pe.c |  33 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  12 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |   5 +-
 arch/powerpc/platforms/pseries/pci_dlpar.c   |   5 +-
 drivers/pci/hotplug/pnv_php.c|   5 +-
 drivers/pci/hotplug/rpadlpar_core.c  |   2 +
 drivers/vfio/vfio_spapr_eeh.c|  10 +-
 14 files changed, 685 insertions(+), 212 deletions(-)

-- 
2.37.1



[RFC 1/3] powerpc/eeh: Synchronization for safety

2022-08-15 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

There is currently little synchronization between EEH error detection
(eeh_dev_check_failure()), EEH error recovery
(eeh_handle_{normal,special}_event()) and the PCI subsystem (device
addition and removal), and so there are race conditions that lead to
crashes (often access to free'd memory or LIST_POISON).

However, a solution must consider:
- EEH error detection can occur in interrupt context, which prevents
the use of a mutex.
- EEH recovery may need to sleep, which prevents the use of a spinlock.
- EEH recovery uses PCI operations that may require the PCI
rescan/remove lock and/or device lock to be held
- PCI operations may hold the rescan/remove and/or device lock when
calling into EEH functions.
- Device driver callbacks may perform arbitrary PCI operations
during recovery, including device removal.

In this patch the existing mutex and spinlock are combined with the
EEH_PE_RECOVERING flag to provide some assurances that are then used
to reduce the race conditions.

The fields to be protected are the ones that provide the structure
of the trees of struct eeh_pe that are held for each PHB: the parent
pointer and child lists and the list of struct eeh_dev, as well as
the pe and pdev pointers within struct eeh_dev.

The existing way of using EEH_PE_RECOVERING is kept and slightly
extended: No struct eeh_pe will be removed while it has the flag set
on it. Additionally, when adding new PEs, they are marked
EEH_PE_RECOVERING if their parent PE is marked: this allows the
recovery thread to assume that all PEs underneath the one it's
processing will continue to exist during recovery.

Both the mutex and spinlock are held while any protected field is
changed or a PE is deleted, so holding either of them (elsewhere) will
keep them stable and safe to access. Additionally, if
EEH_PE_RECOVERING is set on a PE then the locks can be released and
re-acquired safely, as long as the protected fields aren't used while
no locks are held. This is used during recovery to release locks
for long sleeps (i.e. during eeh_wait_state() when we may sleep up to
5 minutes), or to maintain lock ordering.

The spinlock is used in error detection (which cannot use a mutex, see
above) and also where it's possible that the mutex is already held.
The mutex is used in areas that don't have that restriction, and where
blocking may be required. Care must be taken when ordering these locks
against the PCI rescan/remove lock and the device locks to avoid
deadlocking.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh.h   |   6 +-
 arch/powerpc/kernel/eeh.c| 112 ++--
 arch/powerpc/kernel/eeh_driver.c | 287 ++-
 arch/powerpc/kernel/eeh_pe.c |  30 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c |  12 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |   5 +-
 arch/powerpc/platforms/pseries/pci_dlpar.c   |   5 +-
 drivers/pci/hotplug/pnv_php.c|   5 +-
 drivers/pci/hotplug/rpadlpar_core.c  |   2 +
 drivers/vfio/vfio_spapr_eeh.c|  10 +-
 10 files changed, 364 insertions(+), 110 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 514dd056c2c8..f659c0433de5 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -271,11 +271,15 @@ static inline bool eeh_state_active(int state)
== (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 }
 
+void eeh_recovery_lock(void);
+void eeh_recovery_unlock(void);
+void eeh_recovery_must_be_locked(void);
+
 typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
 typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag);
 void eeh_set_pe_aux_size(int size);
 int eeh_phb_pe_create(struct pci_controller *phb);
-int eeh_wait_state(struct eeh_pe *pe, int max_wait);
+int eeh_wait_state(struct eeh_pe *pe, int max_waiti, bool unlock);
 struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root);
 struct eeh_pe *eeh_pe_get(struct pci_controller *phb, int pe_no);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ab316e155ea9..2c90c37524ed 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -108,7 +108,25 @@ bool eeh_debugfs_no_recover;
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
 
-/* Lock to avoid races due to multiple reports of an error */
+/*
+ * confirm_error_lock and eeh_dev_mutex are used together to provide
+ * safety during EEH operations.
+ *
+ * Generally, the spinlock is used in error detection where it's not possible
+ * to use a mutex or where there is potential to deadlock with the mutex, and
+ * the mutex is used during recovery and other PCI related operations. One must
+ * be held when reading and both must be held when making changes to the
+ * p

[RFC 3/3] powerpc/eeh: Asynchronous recovery

2022-08-15 Thread Ganesh Goudar
Based on the original work from Sam Bobroff.

Currently, EEH recovery is entirely serialized and takes place within
a single kernel thread. This can cause recovery to take a long time
when there are many devices.

To shorten recovery time, this change allows recovery to proceed in
parallel in two ways:
- Each PHB is given its own recovery event queue and can be recovered
independently of the other PHBs.
- Driver handlers are called in parallel, but with the constraint that
handlers higher up (closer to the PHB) in the PE hierarchy must be
called before those lower down.

To maintain the constraint, above, the driver handlers are called by
traversing the tree of affected PEs from the top, stopping to call
handlers (in parallel) when a PE with devices is discovered. When the
calls for that PE are complete, traversal continues at each child PE.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/include/asm/eeh.h|   1 +
 arch/powerpc/include/asm/eeh_event.h  |   7 +
 arch/powerpc/include/asm/pci-bridge.h |   3 +
 arch/powerpc/kernel/eeh_driver.c  | 323 +++---
 arch/powerpc/kernel/eeh_event.c   |  65 +++---
 arch/powerpc/kernel/eeh_pe.c  |   3 +
 6 files changed, 288 insertions(+), 114 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index f659c0433de5..2728aee5cb0b 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -128,6 +128,7 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 #define EEH_DEV_NO_HANDLER (1 << 8)/* No error handler */
 #define EEH_DEV_SYSFS  (1 << 9)/* Sysfs created*/
 #define EEH_DEV_REMOVED(1 << 10)   /* Removed permanently  
*/
+#define EEH_DEV_RECOVERING (1 << 11)   /* Recovering   */
 
 struct eeh_dev {
int mode;   /* EEH mode */
diff --git a/arch/powerpc/include/asm/eeh_event.h 
b/arch/powerpc/include/asm/eeh_event.h
index a1fe736bc4cf..b21f49e87b7b 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -8,6 +8,8 @@
 #define ASM_POWERPC_EEH_EVENT_H
 #ifdef __KERNEL__
 
+#include 
+
 /*
  * structure holding pci controller data that describes a
  * change in the isolation status of a PCI slot.  A pointer
@@ -15,16 +17,21 @@
  * callback.
  */
 struct eeh_event {
+   struct work_struct  work;
struct list_headlist;   /* to form event queue  */
struct eeh_pe   *pe;/* EEH PE   */
unsigned intid; /* Event ID */
 };
 
+extern spinlock_t eeh_eventlist_lock;
+
 int eeh_event_init(void);
+int eeh_phb_event(struct eeh_pe *pe);
 int eeh_send_failure_event(struct eeh_pe *pe);
 int __eeh_send_failure_event(struct eeh_pe *pe);
 void eeh_remove_event(struct eeh_pe *pe, bool force);
 void eeh_handle_normal_event(unsigned int event_id, struct eeh_pe *pe);
+void eeh_handle_normal_event_work(struct work_struct *work);
 void eeh_handle_special_event(void);
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index c85f901227c9..74806009f50a 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -131,6 +131,9 @@ struct pci_controller {
struct irq_domain   *dev_domain;
struct irq_domain   *msi_domain;
struct fwnode_handle*fwnode;
+
+   bool eeh_in_progress;
+   struct list_head eeh_eventlist;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 894326cc4dfa..3abd5f2d146c 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -12,12 +12,17 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 
+static atomic_t eeh_wu_id = ATOMIC_INIT(0);
+
 struct eeh_rmv_data {
struct list_head removed_vf_list;
int removed_dev_count;
@@ -248,73 +253,59 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool 
enable)
 }
 
 typedef enum pci_ers_result (*eeh_report_fn)(unsigned int event_id,
+unsigned int id,
 struct pci_dev *,
 struct pci_driver *);
 static void eeh_pe_report_pdev(unsigned int event_id,
-  struct pci_dev *pdev, eeh_report_fn fn,
+  unsigned int id,
+  struct pci_dev *pdev,
+  const char *fn_name, eeh_report_fn fn,
   enum pci_ers_result *result,
-  const char *handler_name)
+  bool late, bool removed, bool passed)
 {

[PATCH] powerpc/eeh: parse AER registers

2025-07-02 Thread Ganesh Goudar
Parse the AER uncorrectable and correctable error status
registers to print the error type and severity.

output looks like
EEH:AER Uncorrectable Error
EEH:AER Error Type: Data Link Protocol Error [Fatal]

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/eeh.c | 84 ++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 83fe99861eb1..03e1e2eeb679 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -139,6 +139,49 @@ struct eeh_stats {
 
 static struct eeh_stats eeh_stats;
 
+static const char * const aer_uncor_errors[] = {
+   "Undefined",
+   "Undefined",
+   "Undefined",
+   "Undefined",
+   "Data Link Protocol",
+   "Surprise Down",
+   "Poisoned TLP",
+   "Flow Control Protocol",
+   "Completion Timeout",
+   "Completer Abort",
+   "Unexpected Completion",
+   "Receiver Overflow",
+   "Malformed TLP",
+   "ECRC Error",
+   "Unsupported Request",
+   "ACS Violation",
+   "Uncorrectable Internal Error",
+   "MC Blocked TLP",
+   "AtomicOp Egress Blocked",
+   "TLPPrefix Blocked",
+   "Poisoned TLP Egress Blocked"
+};
+
+static const char * const aer_cor_errors[] = {
+   "Receiver Error",
+   "Undefined",
+   "Undefined",
+   "Undefined",
+   "Undefined",
+   "Undefined",
+   "Bad TLP",
+   "Bad DLLP",
+   "Replay Num Rollover",
+   "Undefined",
+   "Undefined",
+   "Undefined",
+   "Replay Timer Timeout",
+   "Advisory Non-Fatal Error",
+   "Corrected Internal Error",
+   "Header Log Overflow",
+};
+
 static int __init eeh_setup(char *str)
 {
if (!strcmp(str, "off"))
@@ -160,6 +203,43 @@ void eeh_show_enabled(void)
pr_info("EEH: No capable adapters found: recovery disabled.\n");
 }
 
+static void eeh_parse_aer_registers(struct eeh_dev *edev, int cap)
+{
+   int i;
+   const char *error_type;
+   u32 uncor_status, uncor_severity, cor_status;
+
+   eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_STATUS, 4, 
&uncor_status);
+   eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_SEVER, 4, 
&uncor_severity);
+   eeh_ops->read_config(edev, cap + PCI_ERR_COR_STATUS, 4, &cor_status);
+
+   if (!uncor_status && !cor_status)
+   return;
+
+   if (uncor_status) {
+   pr_err("EEH:AER Uncorrectable Error\n");
+   for (i = 0; i < ARRAY_SIZE(aer_uncor_errors); i++) {
+   if (uncor_status & (1 << i)) {
+   error_type = (i < ARRAY_SIZE(aer_uncor_errors))
+? aer_uncor_errors[i] : "Unknown";
+   pr_err("EEH:AER Error Type: %s [%s]\n", 
error_type,
+  (uncor_severity & (1 << i)) ? "Fatal" : 
"Non-Fatal");
+   }
+   }
+   }
+
+   if (cor_status) {
+   pr_err("EEH:AER Correctable Error\n");
+   for (i = 0; i < ARRAY_SIZE(aer_cor_errors); i++) {
+   if (cor_status & (1 << i)) {
+   error_type = (i < ARRAY_SIZE(aer_cor_errors))
+ ? aer_cor_errors[i] : "Unknown";
+   pr_err("EEH:AER Error Type: %s\n", error_type);
+   }
+   }
+   }
+}
+
 /*
  * This routine captures assorted PCI configuration space data
  * for the indicated PCI device, and puts them into a buffer
@@ -237,9 +317,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
pr_warn("%s\n", buffer);
}
 
-   /* If AER capable, dump it */
+   /* If AER capable, parse and dump it */
cap = edev->aer_cap;
if (cap) {
+   eeh_parse_aer_registers(edev, cap);
+
n += scnprintf(buf+n, len-n, "pci-e AER:\n");
pr_warn("EEH: PCI-E AER capability register set follows:\n");
 
-- 
2.48.1