------- Comment From sudeeshj...@in.ibm.com 2017-12-04 03:43 EDT-------
The reported issue is no longer seen with the given kernel.

root@ltc84-pkvm1:~# lspci | grep acc
0001:01:00.0 Processing accelerators: IBM Device 0477 (rev 01)
0002:00:00.0 Processing accelerators: IBM Device 4350 (rev 0a)
root@ltc84-pkvm1:~#
root@ltc84-pkvm1:~#
root@ltc84-pkvm1:~# echo 10000 > /sys/kernel/debug/powerpc/eeh_max_freezes
root@ltc84-pkvm1:~# echo 1 > /sys/class/cxl/card0/perst_reloads_same_image
root@ltc84-pkvm1:~# echo 0x8000000000000000 > /sys/kernel/debug/powerpc/PCI0001/err_injct_outbound
root@ltc84-pkvm1:~#
root@ltc84-pkvm1:~#
root@ltc84-pkvm1:~#
root@ltc84-pkvm1:~# echo 0x8000000000000000 > /sys/kernel/debug/powerpc/PCI0001/err_injct_outbound
root@ltc84-pkvm1:~# dpkg -l | grep linux-im
rc  linux-image-4.10.0-26-generic       4.10.0-26.30~lp1694485            
ppc64el      Linux kernel image for version 4.10.0 on PowerPC 64el SMP
ii  linux-image-4.10.0-40-generic       4.10.0-40.44~lp1694485            
ppc64el      Linux kernel image for version 4.10.0 on PowerPC 64el SMP
rc  linux-image-extra-4.10.0-26-generic 4.10.0-26.30~lp1694485            
ppc64el      Linux kernel extra modules for version 4.10.0 on PowerPC 64el SMP
rc  linux-image-extra-4.10.0-40-generic 4.10.0-40.44~lp1694485            
ppc64el      Linux kernel extra modules for version 4.10.0 on PowerPC 64el SMP
root@ltc84-pkvm1:~# uname -a
Linux ltc84-pkvm1 4.10.0-40-generic #44~lp1694485 SMP Sat Dec 2 20:43:42 UTC 
2017 ppc64le ppc64le ppc64le GNU/Linux
root@ltc84-pkvm1:~#

root@ltc84-pkvm1:~# dmesg
[  115.720740] Harmless Hypervisor Maintenance interrupt [Recovered]
[  115.720747] EEH: Fenced PHB#1 detected, location: N/A
[  115.721905] EEH: This PCI device has failed 1 times in the last hour
[  115.721906] EEH: Notify device drivers to shutdown
[  115.721916] cxl afu0.0: Deactivating AFU directed mode
[  115.722170] cxl afu0.0: PSL Purge called with link down, ignoring
[  115.722585]  Error detail: Unknown
[  115.722586]  HMER: 8040000000000000
[  115.722588] Harmless Hypervisor Maintenance interrupt [Recovered]
[  115.722588]  Error detail: Unknown
[  115.722589]  HMER: 8040000000000000
[  115.722682] EEH: Collect temporary log
[  115.722684] PHB3 PHB#1 Diag-data (Version: 1)
[  115.722686] brdgCtl:     0000ffff
[  115.722687] UtlSts:      00200000 00000000 00000000
[  115.722689] RootSts:     ffffffff ffffffff ffffffff ffffffff 0000ffff
[  115.722690] RootErrSts:  ffffffff ffffffff ffffffff
[  115.722691] RootErrLog:  ffffffff ffffffff ffffffff ffffffff
[  115.722693] RootErrLog1: ffffffff 0000000000000000 0000000000000000
[  115.722694] nFir:        0000809000000000 0030006e00000000 0000800000000000
[  115.722695] PhbSts:      0000001800000000 0000001800000000
[  115.722697] Lem:         8000020000800000 40018e2400022482 8000000000000000
[  115.722699] OutErr:      8000002000000000 8000000000000000 1210066000020003 
0000c00000000000
[  115.722700] InBErr:      0000000040000000 0000000040000000 0000080000000000 
000c104010010000
[  115.722702] EEH: Reset without hotplug activity
[  120.232880] EEH: Notify device drivers the completion of reset
[  120.232891] cxl-pci 0001:01:00.0: enabling device (0140 -> 0142)
[  120.233197] pci 0001:01     : [PE# 00] Switching PHB to CXL
[  120.233301] pci 0001:01     : [PE# 00] Switching PHB to CXL
[  120.244892] cxl afu0.0: Activating AFU directed mode
[  120.245015] EEH: Notify device driver to resume
[  710.830994] Harmless Hypervisor Maintenance interrupt [Recovered]
[  710.831000]  Error detail: Unknown
[  710.831003]  HMER: 8040000000000000
[  710.831006] Harmless Hypervisor Maintenance interrupt [Recovered]
[  710.831008]  Error detail: Unknown
[  710.831011]  HMER: 8040000000000000
[  710.831522] EEH: Fenced PHB#1 detected, location: N/A
[  710.833355] EEH: This PCI device has failed 2 times in the last hour
[  710.833356] EEH: Notify device drivers to shutdown
[  710.833364] cxl afu0.0: Deactivating AFU directed mode
[  710.833503] cxl afu0.0: PSL Purge called with link down, ignoring
[  710.833698] EEH: Collect temporary log
[  710.833701] PHB3 PHB#1 Diag-data (Version: 1)
[  710.833703] brdgCtl:     0000ffff
[  710.833704] UtlSts:      00200000 00000000 00000000
[  710.833706] RootSts:     ffffffff ffffffff ffffffff ffffffff 0000ffff
[  710.833708] RootErrSts:  ffffffff ffffffff ffffffff
[  710.833709] RootErrLog:  ffffffff ffffffff ffffffff ffffffff
[  710.833710] RootErrLog1: ffffffff 0000000000000000 0000000000000000
[  710.833712] nFir:        0000809000000000 0030006e00000000 0000800000000000
[  710.833713] PhbSts:      0000001800000000 0000001800000000
[  710.833715] Lem:         8000020000000000 40018e2400022482 8000000000000000
[  710.833717] OutErr:      8000002000000000 8000000000000000 1210046000020003 
0000800000000000
[  710.833719] EEH: Reset without hotplug activity
[  715.319705] EEH: Notify device drivers the completion of reset
[  715.319715] cxl-pci 0001:01:00.0: enabling device (0140 -> 0142)
[  715.320017] pci 0001:01     : [PE# 00] Switching PHB to CXL
[  715.320117] pci 0001:01     : [PE# 00] Switching PHB to CXL
[  715.331720] cxl afu0.0: Activating AFU directed mode
[  715.331838] EEH: Notify device driver to resume
root@ltc84-pkvm1:~#

-- 
You received this bug notification because you are a member of Kernel
Packages, which is subscribed to linux in Ubuntu.
https://bugs.launchpad.net/bugs/1694485

Title:
  Ubuntu17.04: CAPI: call trace seen while error injection to  the CAPI
  card.

Status in The Ubuntu-power-systems project:
  Incomplete
Status in linux package in Ubuntu:
  Incomplete
Status in linux source package in Zesty:
  In Progress

Bug description:
  == Comment: #0 - SUDEESH JOHN  - 2017-03-18 13:55:03 ==
  ---Problem Description---
  call trace while injecting error to the CAPI card.

  " WARNING: CPU: 31 PID: 491 at 
/build/linux-VtwHOM/linux-4.10.0/drivers/misc/cxl/main.c:325 
cxl_adapter_context_unlock+0x68/0x90 [cxl] " 
    
  ---uname output---
  Linux freak 4.10.0-13-generic #15-Ubuntu SMP Thu Mar 9 20:27:28 UTC 2017 
ppc64le ppc64le ppc64le GNU/Linux
   
  Machine Type = PowerNV 8247-21L 
    
  ---Steps to Reproduce---
  1. echo 10000 > /sys/kernel/debug/powerpc/eeh_max_freezes
  2. echo 1 > /sys/class/cxl/card0/perst_reloads_same_image
  3. echo 0x8000000000000000 > /sys/kernel/debug/powerpc/PCI0000/err_injct_outbound
   
  ---The complete call trace ---

  Mar 18 14:39:09 freak kernel: [  289.675421] ------------[ cut here 
]------------
  Mar 18 14:39:09 freak kernel: [  289.675431] WARNING: CPU: 5 PID: 491 at 
/build/linux-VtwHOM/linux-4.10.0/drivers/misc/cxl/main.c:325 
cxl_adapter_context_unlock+0x68/0x90 [cxl]
  Mar 18 14:39:09 freak kernel: [  289.675432] Modules linked in: xt_CHECKSUM 
iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 
nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT 
nf_reject_ipv4 xt_tcpudp bridge stp llc kvm_hv kvm_pr kvm ebtable_filter 
ebtables ip6table_filter ip6_tables iptable_filter uio_pdrv_genirq uio 
ipmi_powernv ipmi_devintf ipmi_msghandler powernv_op_panel powernv_rng 
vmx_crypto ibmpowernv leds_powernv ib_iser rdma_cm iw_cm ib_cm ib_core configfs 
iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ip_tables x_tables autofs4 
btrfs raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx 
xor raid6_pq raid1 raid0 multipath linear ses enclosure scsi_transport_sas 
bnx2x mlx5_core tg3 cxl mdio ipr libcrc32c devlink crc32c_vpmsum pnv_php
  Mar 18 14:39:09 freak kernel: [  289.675490] CPU: 5 PID: 491 Comm: eehd Not 
tainted 4.10.0-13-generic #15-Ubuntu
  Mar 18 14:39:09 freak kernel: [  289.675492] task: c0000003bfbfde00 
task.stack: c0000003bfc5c000
  Mar 18 14:39:09 freak kernel: [  289.675493] NIP: d000000005cc0ca0 LR: 
d000000005cc0c9c CTR: c000000000605aa0
  Mar 18 14:39:09 freak kernel: [  289.675495] REGS: c0000003bfc5f6a0 TRAP: 
0700   Not tainted  (4.10.0-13-generic)
  Mar 18 14:39:09 freak kernel: [  289.675496] MSR: 900000000282b033 
<SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>
  Mar 18 14:39:09 freak kernel: [  289.675504]   CR: 28008282  XER: 20000000
  Mar 18 14:39:09 freak kernel: [  289.675504] CFAR: c000000000b568dc SOFTE: 1
  Mar 18 14:39:09 freak kernel: [  289.675504] GPR00: d000000005cc0c9c 
c0000003bfc5f920 d000000005cf2d88 000000000000002f
  Mar 18 14:39:09 freak kernel: [  289.675504] GPR04: 0000000000000001 
00000000000003fd 0000000063206576 0000000000000000
  Mar 18 14:39:09 freak kernel: [  289.675504] GPR08: c0000000015dc700 
0000000000000000 0000000000000000 0000000000000001
  Mar 18 14:39:09 freak kernel: [  289.675504] GPR12: 0000000000008800 
c00000000fb82d00 c000000000108c88 c0000003c51f9f00
  Mar 18 14:39:09 freak kernel: [  289.675504] GPR16: 0000000000000000 
0000000000000000 0000000000000000 0000000000000000
  Mar 18 14:39:09 freak kernel: [  289.675504] GPR20: 0000000000000000 
0000000000000000 0000000000000000 c000000000d53990
  Mar 18 14:39:09 freak kernel: [  289.675504] GPR24: c000000000d53968 
c0000000014a4330 c0000003ab8fa800 c0000003bd2c20c0
  Mar 18 14:39:09 freak kernel: [  289.675504] GPR28: c0000003c5051098 
0000000000000000 c0000003ab8fa800 0000000000000000
  Mar 18 14:39:09 freak kernel: [  289.675535] NIP [d000000005cc0ca0] 
cxl_adapter_context_unlock+0x68/0x90 [cxl]
  Mar 18 14:39:09 freak kernel: [  289.675540] LR [d000000005cc0c9c] 
cxl_adapter_context_unlock+0x64/0x90 [cxl]
  Mar 18 14:39:09 freak kernel: [  289.675541] Call Trace:
  Mar 18 14:39:09 freak kernel: [  289.675547] [c0000003bfc5f920] 
[d000000005cc0c9c] cxl_adapter_context_unlock+0x64/0x90 [cxl] (unreliable)
  Mar 18 14:39:09 freak kernel: [  289.675556] [c0000003bfc5f980] 
[d000000005cd022c] cxl_configure_adapter+0x954/0x990 [cxl]
  Mar 18 14:39:09 freak kernel: [  289.675563] [c0000003bfc5fa30] 
[d000000005cd02c0] cxl_pci_slot_reset+0x58/0x240 [cxl]
  Mar 18 14:39:09 freak kernel: [  289.675568] [c0000003bfc5fae0] 
[c00000000003b0d4] eeh_report_reset+0x154/0x190
  Mar 18 14:39:09 freak kernel: [  289.675571] [c0000003bfc5fb20] 
[c000000000039428] eeh_pe_dev_traverse+0x98/0x170
  Mar 18 14:39:09 freak kernel: [  289.675574] [c0000003bfc5fbb0] 
[c00000000003b81c] eeh_handle_normal_event+0x3ec/0x540
  Mar 18 14:39:09 freak kernel: [  289.675577] [c0000003bfc5fc60] 
[c00000000003bbd4] eeh_handle_event+0x174/0x360
  Mar 18 14:39:09 freak kernel: [  289.675580] [c0000003bfc5fd10] 
[c00000000003bfa8] eeh_event_handler+0x1e8/0x1f0
  Mar 18 14:39:09 freak kernel: [  289.675583] [c0000003bfc5fdc0] 
[c000000000108dd4] kthread+0x154/0x1a0
  Mar 18 14:39:09 freak kernel: [  289.675586] [c0000003bfc5fe30] 
[c00000000000b4e8] ret_from_kernel_thread+0x5c/0x74
  Mar 18 14:39:09 freak kernel: [  289.675588] Instruction dump:
  Mar 18 14:39:09 freak kernel: [  289.675590] 2f84ffff 4d9e0020 7c0802a6 
f8010010 f821ffa1 39200000 7c8407b4 912303d0
  Mar 18 14:39:09 freak kernel: [  289.675596] 3d220000 e8698070 4801f159 
e8410018 <0fe00000> 38210060 e8010010 7c0803a6
  Mar 18 14:39:09 freak kernel: [  289.675602] ---[ end trace 113989c345fee0d3 
]---
  Mar 18 14:39:09 freak kernel: [  289.675642] cxl afu0.0: Activating AFU 
directed mode

  == Comment: #2 - Vaibhav Jain - 2017-03-20 05:00:20 ==
  Have sent a fix patch to the ppc-dev list for review: https://patchwork.ozlabs.org/patch/740876/

  == Comment: #3 - Vaibhav Jain - 2017-05-16 01:56:32 ==
  Patch merged to mainline, viz. commit ea9a26d117cf0637c71d3e0076f4a124bf5859df ('cxl: Force context lock during EEH flow')

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu-power-systems/+bug/1694485/+subscriptions

-- 
Mailing list: https://launchpad.net/~kernel-packages
Post to     : kernel-packages@lists.launchpad.net
Unsubscribe : https://launchpad.net/~kernel-packages
More help   : https://help.launchpad.net/ListHelp

Reply via email to