** Description changed:

+ [Impact]
+ Latest kexec-tools is needed to load/kexec recent kernels. For older 
releases, like xenial, it's needed to support linux-hwe kernels.
+ 
+ [Regression Potential]
+ It might fail to load the GA kernels, like a 4.4 kernel on xenial.
+ 
+ [Test case]
+ Different kernels on different architectures have been tested.
+ 
+ 
  == Comment: #0 - INDIRA P. JOGA
  Problem Description:
  ===================
  System hung with a "Kernel panic - not syncing: Out of memory" message
  when a crash is triggered
  
  Steps to re-create:
  ==================
  > Installed ubuntu1804 daily build on Witherspoon test system
  root@whip:~# uname -a
  Linux whip 4.13.0-17-generic #20-Ubuntu SMP Mon Nov 6 10:03:08 UTC 2017 
ppc64le ppc64le ppc64le GNU/Linux
  root@whip:~# uname -r
  4.13.0-17-generic
  
  > root@whip:~# free -h
-               total        used        free      shared  buff/cache   
available
+               total        used        free      shared  buff/cache   
available
  Mem:           507G        2.0G        504G         19M        728M        
503G
  Swap:          2.0G          0B        2.0G
  
- 
- > Edited the grub /etc/default/grub.d/kexec-tools.cfg file and set the crash 
kernel parameter=4096M
+ > Edited the grub /etc/default/grub.d/kexec-tools.cfg file and set the
+ crash kernel parameter=4096M
  
  > Updated grub using update-grub command and reboot system.
  
  root@whip:~# cat /proc/cmdline
  root=UUID=46c6aa02-8215-44cc-b3fc-0bc79c3c8815 ro splash quiet 
crashkernel=4096M
  
  > kdump status before triggering crash
  
  root@whip:~# kdump-config show
  DUMP_MODE:        kdump
  USE_KDUMP:        1
  KDUMP_SYSCTL:     kernel.panic_on_oops=1
  KDUMP_COREDIR:    /var/crash
- crashkernel addr: 
-    /var/lib/kdump/vmlinuz: symbolic link to /boot/vmlinux-4.13.0-17-generic
- kdump initrd: 
-    /var/lib/kdump/initrd.img: symbolic link to 
/var/lib/kdump/initrd.img-4.13.0-17-generic
+ crashkernel addr:
+    /var/lib/kdump/vmlinuz: symbolic link to /boot/vmlinux-4.13.0-17-generic
+ kdump initrd:
+    /var/lib/kdump/initrd.img: symbolic link to 
/var/lib/kdump/initrd.img-4.13.0-17-generic
  current state:    ready to kdump
  
  kexec command:
-   /sbin/kexec -p 
--command-line="root=UUID=46c6aa02-8215-44cc-b3fc-0bc79c3c8815 ro splash quiet 
irqpoll noirqdistrib nr_cpus=1 nousb systemd.unit=kdump-tools.service 
ata_piix.prefer_ms_hyperv=0" --initrd=/var/lib/kdump/initrd.img 
/var/lib/kdump/vmlinuz
+   /sbin/kexec -p 
--command-line="root=UUID=46c6aa02-8215-44cc-b3fc-0bc79c3c8815 ro splash quiet 
irqpoll noirqdistrib nr_cpus=1 nousb systemd.unit=kdump-tools.service 
ata_piix.prefer_ms_hyperv=0" --initrd=/var/lib/kdump/initrd.img 
/var/lib/kdump/vmlinuz
  
  root@whip:~# kdump-config status
  current state   : ready to kdump
  
  >  Enabled sysrq
  root@whip:~# sysctl -w kernel.sysrq=1
  kernel.sysrq = 1
  
  > Triggered crash and it hangs with kernel panic- OOM message as below
  
  root@whip:~# echo c > /proc/sysrq-trigger
  [   85.731415] sysrq: SysRq : Trigger a crash
  [   85.731472] Unable to handle kernel paging request for data at address 
0x00000000
  [   85.731584] Faulting instruction address: 0xc00000000078f588
  [   85.731670] Oops: Kernel access of bad area, sig: 11 [#1]
- [   85.731744] SMP NR_CPUS=2048 
- [   85.731745] NUMA 
+ [   85.731744] SMP NR_CPUS=2048
+ [   85.731745] NUMA
  [   85.731790] PowerNV
  [   85.731853] Modules linked in: rpcsec_gss_krb5 nfsv4 nfs fscache sctp_diag 
sctp dccp_diag dccp tcp_diag udp_diag raw_diag inet_diag unix_diag 
af_packet_diag netlink_diag binfmt_misc vmx_crypto crct10dif_vpmsum ofpart 
cmdlinepart idt_89hpesx powernv_flash ipmi_powernv opal_prd ibmpowernv mtd 
ipmi_devintf ipmi_msghandler at24 uio_pdrv_genirq uio dm_multipath scsi_dh_rdac 
scsi_dh_emc scsi_dh_alua nfsd auth_rpcgss sch_fq_codel nfs_acl lockd grace 
sunrpc ip_tables x_tables autofs4 btrfs xor raid6_pq nouveau bnx2x ast 
i2c_algo_bit ttm drm_kms_helper mdio libcrc32c crc32c_vpmsum mlx5_core 
syscopyarea sysfillrect sysimgblt fb_sys_fops tg3 drm ahci mlxfw libahci nvme 
devlink nvme_core
  [   85.732704] CPU: 10 PID: 4316 Comm: bash Not tainted 4.13.0-17-generic 
#20-Ubuntu
  [   85.732764] task: c000003fcb141700 task.stack: c000003fc2374000
  [   85.732858] NIP: c00000000078f588 LR: c0000000007904b8 CTR: 
c00000000078f560
  [   85.732977] REGS: c000003fc23779f0 TRAP: 0300   Not tainted  
(4.13.0-17-generic)
  [   85.733066] MSR: 9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>
  [   85.733075]   CR: 28422222  XER: 20040000
- [   85.733201] CFAR: c0000000007904b4 DAR: 0000000000000000 DSISR: 42000000 
SOFTE: 1 
- [   85.733201] GPR00: c0000000007904b8 c000003fc2377c70 c0000000015f6000 
0000000000000063 
- [   85.733201] GPR04: c000003feedfade8 c000003feee12068 9000000000009033 
000000000000000a 
- [   85.733201] GPR08: 0000000000000007 0000000000000001 0000000000000000 
9000000000001003 
- [   85.733201] GPR12: c00000000078f560 c000000007a66900 0000000010180df8 
0000000010189e30 
- [   85.733201] GPR16: 0000000010189ea8 0000000010151210 000000001018bd58 
000000001018de48 
- [   85.733201] GPR20: 00000000321168d8 0000000000000001 0000000010164590 
0000000010163bb0 
- [   85.733201] GPR24: 00007fffcfa6e7d4 00007fffcfa6e7d0 c0000000014fa570 
0000000000000002 
- [   85.733201] GPR28: 0000000000000063 0000000000000004 c0000000014822f4 
c0000000014fa910 
+ [   85.733201] CFAR: c0000000007904b4 DAR: 0000000000000000 DSISR: 42000000 
SOFTE: 1
+ [   85.733201] GPR00: c0000000007904b8 c000003fc2377c70 c0000000015f6000 
0000000000000063
+ [   85.733201] GPR04: c000003feedfade8 c000003feee12068 9000000000009033 
000000000000000a
+ [   85.733201] GPR08: 0000000000000007 0000000000000001 0000000000000000 
9000000000001003
+ [   85.733201] GPR12: c00000000078f560 c000000007a66900 0000000010180df8 
0000000010189e30
+ [   85.733201] GPR16: 0000000010189ea8 0000000010151210 000000001018bd58 
000000001018de48
+ [   85.733201] GPR20: 00000000321168d8 0000000000000001 0000000010164590 
0000000010163bb0
+ [   85.733201] GPR24: 00007fffcfa6e7d4 00007fffcfa6e7d0 c0000000014fa570 
0000000000000002
+ [   85.733201] GPR28: 0000000000000063 0000000000000004 c0000000014822f4 
c0000000014fa910
  [   85.734116] NIP [c00000000078f588] sysrq_handle_crash+0x28/0x30
  [   85.734211] LR [c0000000007904b8] __handle_sysrq+0xf8/0x2b0
  [   85.734285] Call Trace:
  [   85.734316] [c000003fc2377c70] [c000000000790498] 
__handle_sysrq+0xd8/0x2b0 (unreliable)
  [   85.734418] [c000003fc2377d10] [c000000000790cb4] 
write_sysrq_trigger+0x64/0x90
  [   85.734541] [c000003fc2377d40] [c00000000044c0c8] proc_reg_write+0x88/0xd0
  [   85.734656] [c000003fc2377d70] [c00000000039db8c] __vfs_write+0x3c/0x70
  [   85.734741] [c000003fc2377d90] [c00000000039f7c8] vfs_write+0xd8/0x220
  [   85.734845] [c000003fc2377de0] [c0000000003a1648] SyS_write+0x68/0x110
  [   85.734959] [c000003fc2377e30] [c00000000000b184] system_call+0x58/0x6c
  [   85.735053] Instruction dump:
- [   85.735102] 4bfff9f1 4bfffe50 3c4c00e6 38426aa0 7c0802a6 60000000 39200001 
3d42001d 
- [   85.735209] 394ad788 912a0000 7c0004ac 39400000 <992a0000> 4e800020 
3c4c00e6 38426a70 
+ [   85.735102] 4bfff9f1 4bfffe50 3c4c00e6 38426aa0 7c0802a6 60000000 39200001 
3d42001d
+ [   85.735209] 394ad788 912a0000 7c0004ac 39400000 <992a0000> 4e800020 
3c4c00e6 38426a70
  [   85.735319] ---[ end trace 711a5d30c86f0359 ]---
- [   86.742408] 
+ [   86.742408]
  [   86.742572] Sending IPI to o[  184.508956788,5] OPAL: Switch to big-endian 
OS
  ther CPUs
  [   86.743959] IPI[  187.269699704,5] OPAL: Switch to little-endian OS
-  complete
+  complete
  [   86.749730] kexec: waiting for cpu 16 (physical 16) to enter OPAL
  [   88.909562] kexec: Starting switchover sequence.
  [    0.000000] OF: reserved mem: not enough space all defined regions.
  [    0.000000] OF: reserved mem: not enough space all defined regions.
  [    0.000000] OF: reserved mem: not enough space all defined regions.
  [    0.000000] OF: reserved mem: not enough space all defined regions.
-  -> smp_release_cpus()
+  -> smp_release_cpus()
  spinning_secondaries = 159
-  <- smp_release_cpus()
+  <- smp_release_cpus()
  [    1.433082] Unable to open file: /etc/keys/x509_ima.der (-2)
  [    1.433086] Unable to open file: /etc/keys/x509_evm.der (-2)
  [    1.500890] vio vio: uevent: failed to send synthetic uevent
  [    1.670523] nouveau 0004:04:00.0: unknown chipset (140000a1)
  [    1.670632] nouveau 0004:05:00.0: unknown chipset (140000a1)
  [    1.670768] nouveau 0035:03:00.0: unknown chipset (140000a1)
  [    1.670872] nouveau 0035:04:00.0: unknown chipset (140000a1)
  /dev/nvme0n1p2: recovering journal
  /dev/nvme0n1p2: clean, 81988/25001984 files, 3168955/99997696 blocks
  [    4.042999] vio vio: uevent: failed to send synthetic uevent
  
- 
- 
  .  .  .  .[    8.856941] Kernel panic - not syncing: Out of memory and no 
killable processes...
- [    8.856941] 
+ [    8.856941]
  [    8.857009] CPU: 0 PID: 175 Comm: kworker/u8:5 Tainted: G        W       
4.13.0-17-generic #20-Ubuntu
  [    8.857131] Workqueue: mkey_cache cache_work_func [mlx5_ib]
  [    8.857188] Call Trace:
  [    8.857215] [c0000000ea90b4f0] [c000000008c5365c] dump_stack+0xb0/0xf4 
(unreliable)
  [    8.857297] [c0000000ea90b530] [c0000000080f9e2c] panic+0x144/0x338
  [    8.857381] [c0000000ea90b5c0] [c0000000082bf530] out_of_memory+0x3e0/0x6f0
  [    8.857456] [c0000000ea90b660] [c0000000082c71a8] 
__alloc_pages_nodemask+0xfe8/0x1080
  [    8.857568] [c0000000ea90b860] [c00000000834d2b0] 
alloc_pages_current+0xa0/0x140
  [    8.857658] [c0000000ea90b8a0] [c00000000835c834] new_slab+0x3d4/0x810
  [    8.857752] [c0000000ea90b970] [c00000000835e96c] ___slab_alloc+0x3fc/0x7a0
  [    8.857826] [c0000000ea90baa0] [c00000000835ed44] __slab_alloc+0x34/0x60
  [    8.857901] [c0000000ea90bad0] [c00000000835f1a4] 
kmem_cache_alloc_trace+0x124/0x300
  [    8.857977] [c0000000ea90bb30] [c0080000033863e4] add_keys+0x5c/0x3a0 
[mlx5_ib]
  [    8.858083] [c0000000ea90bc20] [c008000003386d84] 
__cache_work_func+0x12c/0x2b0 [mlx5_ib]
  [    8.858199] [c0000000ea90bca0] [c000000008120a58] 
process_one_work+0x298/0x5a0
  [    8.858299] [c0000000ea90bd30] [c000000008120de8] worker_thread+0x88/0x620
  [    8.858383] [c0000000ea90bdc0] [c000000008129c5c] kthread+0x1ac/0x1c0
  [    8.858476] [c0000000ea90be30] [c00000000800b4e8] 
ret_from_kernel_thread+0x5c/0x74
  [    9.666011] ---[ end Kernel panic - not syncing: Out of memory and no 
killable processes...
  [    9.666
  
  > System is available for debugging.
  
- 
- == Comment: #1 - INDIRA P. JOGA 
+ == Comment: #1 - INDIRA P. JOGA
  > Tried with default crash kernel parameter
  crashkernel=2G-4G:320M,4G-32G:512M,32G-64G:1024M,64G-128G:2048M,128G-:4096M  
and still facing same kernel panic hung issue with the system
  
  root@whip:~# cat /proc/cmdline
  root=UUID=46c6aa02-8215-44cc-b3fc-0bc79c3c8815 ro splash quiet 
crashkernel=2G-4G:320M,4G-32G:512M,32G-64G:1024M,64G-128G:2048M,128G-:4096M
  
- 
- > Triggered crash and it hangs 
+ > Triggered crash and it hangs
  Regards,
  Indira
  
- 
- == Comment: #6 - INDIRA P. JOGA 
+ == Comment: #6 - INDIRA P. JOGA
  Hi Urvashi,
  
  Tried triggering a crash with a higher crashkernel parameter (16384M)
  and I see the issue recreated: it throws similar messages, such as
  "uevent: failed to send synthetic uevent", but this time it rebooted
  and hangs while booting.
  
- 
  > Crashkernel parameter
  
  root@whip:~# cat /proc/cmdline
  root=UUID=2f422155-c251-4bc1-abd8-47cb17a13e65 ro splash quiet 
crashkernel=16384M
  
- 
- > Triggered crash 
+ > Triggered crash
  
  > Attached console logs
  
  Regards,
  Indira
  
  == Comment: #8 - INDIRA P. JOGA <[email protected]> - 2018-01-03 
22:45:02 ==
  > kexec-tools and makedumpfile versions from test system
  
  kexec-tools/bionic,now 1:2.0.15-0ubuntu1 ppc64el [installed,automatic]
  makedumpfile/bionic,now 1:1.6.2-1ubuntu1 ppc64el [installed,automatic]
  
  Regards,
  Indira
  
- 
  == Comment: #10 - Hari Krishna Bathini <[email protected]> - 2018-01-04 
04:50:01 ==
  (In reply to comment #6)
  
  This is due to checkstop that occurs on systems with GPUs connected.
  The below two patches are needed to fix this:
  
- 
-   commit aec4d0f7a2502a13fc21e90ff32dc306b0ad1190
-   Author: Hari Bathini <[email protected]>
-   Date:   Thu Aug 17 18:01:51 2017 +0530
- 
-     kexec-tools: ppc64: avoid adding coherent memory regions to crash memory 
ranges
-     
-     Accelerator devices like GPU and FPGA cards contain onboard memory. This
-     onboard memory is represented as a memory only NUMA node, integrating it
-     with core memory subsystem. Since, the link through which these devices
-     are integrated to core memory goes down after a system crash and they are
-     meant for user workloads, avoid adding coherent device memory regions to
-     crash memory ranges. Without this change, makedumpfile tool tries to save
-     unaccessible coherent device memory regions, crashing the system.
-     
-     Signed-off-by: Hari Bathini <[email protected]>
-     Tested-by: Pingfan Liu <[email protected]>
-     Signed-off-by: Simon Horman <[email protected]>
+   commit aec4d0f7a2502a13fc21e90ff32dc306b0ad1190
+   Author: Hari Bathini <[email protected]>
+   Date:   Thu Aug 17 18:01:51 2017 +0530
+ 
+     kexec-tools: ppc64: avoid adding coherent memory regions to crash
+ memory ranges
+ 
+     Accelerator devices like GPU and FPGA cards contain onboard memory. This
+     onboard memory is represented as a memory only NUMA node, integrating it
+     with core memory subsystem. Since, the link through which these devices
+     are integrated to core memory goes down after a system crash and they are
+     meant for user workloads, avoid adding coherent device memory regions to
+     crash memory ranges. Without this change, makedumpfile tool tries to save
+     unaccessible coherent device memory regions, crashing the system.
+ 
+     Signed-off-by: Hari Bathini <[email protected]>
+     Tested-by: Pingfan Liu <[email protected]>
+     Signed-off-by: Simon Horman <[email protected]>
  --
  
-   commit 69431282f075ab723c4886f20aa248976920aaae
-   Author: Hari Bathini <[email protected]>
-   Date:   Tue Aug 29 23:08:02 2017 +0530
- 
-     kexec-tools: ppc64: fix leak while checking for coherent device memory
-     
-     Signed-off-by: Hari Bathini <[email protected]>
-     Signed-off-by: Simon Horman <[email protected]>
+   commit 69431282f075ab723c4886f20aa248976920aaae
+   Author: Hari Bathini <[email protected]>
+   Date:   Tue Aug 29 23:08:02 2017 +0530
+ 
+     kexec-tools: ppc64: fix leak while checking for coherent device
+ memory
+ 
+     Signed-off-by: Hari Bathini <[email protected]>
+     Signed-off-by: Simon Horman <[email protected]>
  --
- 
  
  Thanks
  Hari
  
  == Comment: #11 - Hari Krishna Bathini <[email protected]> - 2018-01-04 
04:52:40 ==
  urvashi, also check if the below patches are included in the kexec-tools 
shipped
  with 18.04. If not, they should also be included:
  
-   commit 21eb397a5fc9227cd95d23e8c74a49cf6a293e57
-   Author: Hari Bathini <[email protected]>
-   Date:   Wed Aug 9 23:47:42 2017 +0530
- 
-     kexec-tools: powerpc: fix command line overflow error
-     
-     Since kernel commit a5980d064fe2 ("powerpc: Bump COMMAND_LINE_SIZE
-     to 2048"), powerpc bumped command line size to 2048 but the size
-     used here is still the default value of 512. Bump it to 2048 to
-     fix command line overflow errors observed when command line length
-     is above 512 bytes. Also, get rid of the multiple definitions of
-     COMMAND_LINE_SIZE macro in ppc architecture.
-     
-     Signed-off-by: Hari Bathini <[email protected]>
-     Signed-off-by: Simon Horman <[email protected]>
+   commit 21eb397a5fc9227cd95d23e8c74a49cf6a293e57
+   Author: Hari Bathini <[email protected]>
+   Date:   Wed Aug 9 23:47:42 2017 +0530
+ 
+     kexec-tools: powerpc: fix command line overflow error
+ 
+     Since kernel commit a5980d064fe2 ("powerpc: Bump COMMAND_LINE_SIZE
+     to 2048"), powerpc bumped command line size to 2048 but the size
+     used here is still the default value of 512. Bump it to 2048 to
+     fix command line overflow errors observed when command line length
+     is above 512 bytes. Also, get rid of the multiple definitions of
+     COMMAND_LINE_SIZE macro in ppc architecture.
+ 
+     Signed-off-by: Hari Bathini <[email protected]>
+     Signed-off-by: Simon Horman <[email protected]>
  --
  
-   commit 47478ea66d4301b12a07862aebc8447a2932f0ed
-   Author: Hari Bathini <[email protected]>
-   Date:   Wed Jul 26 22:49:41 2017 +0530
- 
-     kexec-tools: ppc64: fix how RMA top is deduced
-     
-     Hang was observed, in purgatory, on a machine configured with
-     single LPAR. This was because one of the segments was loaded
-     outside the actual Real Memory Area (RMA) due to wrongly
-     deduced RMA top value.
-     
-     Currently, top of real memory area, which is crucial for loading
-     kexec/kdump kernel, is obtained by iterating through mem nodes
-     and setting its value based on the base and size values of the
-     last mem node in the iteration. That can't always be correct as
-     the order of iteration may not be same and RMA base & size are
-     always based on the first memory property. Fix this by setting
-     RMA top value based on the base and size values of the memory
-     node that has the smallest base value (first memory property)
-     among all the memory nodes.
-     
-     Also, correct the misnomers rmo_base and rmo_top to rma_base
-     and rma_top respectively.
-     
-     While how RMA top is deduced was broken for sometime, the issue
-     may not have been seen so far, for couple of possible reasons:
-     
-         1. Only one mem node was available.
-         2. First memory property has been the last node in
-            iteration when multiple mem nodes were present.
-     
-     Fixes: 02f4088ffded ("kexec fix ppc64 device-tree mem node")
-     Reported-by: Ankit Kumar <[email protected]>
-     Cc: Michael Ellerman <[email protected]>
-     Cc: Geoff Levand <[email protected]>
-     Signed-off-by: Hari Bathini <[email protected]>
-     Signed-off-by: Simon Horman <[email protected]>
+   commit 47478ea66d4301b12a07862aebc8447a2932f0ed
+   Author: Hari Bathini <[email protected]>
+   Date:   Wed Jul 26 22:49:41 2017 +0530
+ 
+     kexec-tools: ppc64: fix how RMA top is deduced
+ 
+     Hang was observed, in purgatory, on a machine configured with
+     single LPAR. This was because one of the segments was loaded
+     outside the actual Real Memory Area (RMA) due to wrongly
+     deduced RMA top value.
+ 
+     Currently, top of real memory area, which is crucial for loading
+     kexec/kdump kernel, is obtained by iterating through mem nodes
+     and setting its value based on the base and size values of the
+     last mem node in the iteration. That can't always be correct as
+     the order of iteration may not be same and RMA base & size are
+     always based on the first memory property. Fix this by setting
+     RMA top value based on the base and size values of the memory
+     node that has the smallest base value (first memory property)
+     among all the memory nodes.
+ 
+     Also, correct the misnomers rmo_base and rmo_top to rma_base
+     and rma_top respectively.
+ 
+     While how RMA top is deduced was broken for sometime, the issue
+     may not have been seen so far, for couple of possible reasons:
+ 
+         1. Only one mem node was available.
+         2. First memory property has been the last node in
+            iteration when multiple mem nodes were present.
+ 
+     Fixes: 02f4088ffded ("kexec fix ppc64 device-tree mem node")
+     Reported-by: Ankit Kumar <[email protected]>
+     Cc: Michael Ellerman <[email protected]>
+     Cc: Geoff Levand <[email protected]>
+     Signed-off-by: Hari Bathini <[email protected]>
+     Signed-off-by: Simon Horman <[email protected]>
  --
  
  Thanks
  Hari
  
  == Comment:
  
  As discussed with Hari,
  
  commit 21eb397a5fc9227cd95d23e8c74a49cf6a293e57
-     kexec-tools: powerpc: fix command line overflow error
+     kexec-tools: powerpc: fix command line overflow error
  
  commit 47478ea66d4301b12a07862aebc8447a2932f0ed
-     kexec-tools: ppc64: fix how RMA top is deduced 
+     kexec-tools: ppc64: fix how RMA top is deduced
  
  Neither of these commits is present in Ubuntu 18.04
  
  Comment 19 Urvashi Jawere
  
  Hi Indira,
  
  Installed updated kexec tools (which includes all the 4 patches) in the
  whip system. Please retest and update us with your findings.
- 
  
  == Comment: #20 - INDIRA P. JOGA <[email protected]> - 2018-01-10 
10:40:06 ==
  Hi Urvashi,
  
  I have tried the kdump (triggering crash) test scenario and observed
  the findings below:
  
  -->> For both the default and 4096M crashkernel parameters, the system
  did not hang when the crash was triggered and it booted up properly,
  but no crash logs were saved under /var/crash.
  
  Regards,
  Indira
  
  == Comment: #
  
  Hi,
  Urvashi/Indira, that is probably because of the below..
  
  >[  154.241351] Faulting instruction address: 0xc000000000792f88
  > [  154.241447] Oops: Kernel access of bad area, sig: 11 [#1]
- > [  154.241517] SMP NR_CPUS=2048 
- > [  154.241518] NUMA 
+ > [  154.241517] SMP NR_CPUS=2048
+ > [  154.241518] NUMA
  >
- 
  
  The above shows that kdump kernel has failed with OOM.
  Can you try using "crashkernel=8192M" and see if dump capture is successful 
with that?
  
  Thanks
  Hari
  
- == Comment: #24 - INDIRA P. JOGA 
+ == Comment: #24 - INDIRA P. JOGA
  Hi Hari/Urvashi,
  
  Tried kdump (triggering crash) test scenario with "crashkernel=8192M"
  and crash logs are saved as below
  
  > root@whip:~# cat /proc/cmdline
  root=UUID=49ed695c-a4dc-4e55-9ca6-319145265826 ro splash quiet 
crashkernel=8192M
  
  root@whip:~# free -h
-               total        used        free      shared  buff/cache   
available
+               total        used        free      shared  buff/cache   
available
  Mem:           503G        3.2G        499G         12M        909M        
498G
  Swap:          2.0G          0B        2.0G
  
  > root@whip:~# kdump-config show
  DUMP_MODE:        kdump
  USE_KDUMP:        1
  KDUMP_SYSCTL:     kernel.panic_on_oops=1
  KDUMP_COREDIR:    /var/crash
- crashkernel addr: 
-    /var/lib/kdump/vmlinuz: symbolic link to /boot/vmlinux-4.13.0-25-generic
- kdump initrd: 
-    /var/lib/kdump/initrd.img: symbolic link to 
/var/lib/kdump/initrd.img-4.13.0-25-generic
+ crashkernel addr:
+    /var/lib/kdump/vmlinuz: symbolic link to /boot/vmlinux-4.13.0-25-generic
+ kdump initrd:
+    /var/lib/kdump/initrd.img: symbolic link to 
/var/lib/kdump/initrd.img-4.13.0-25-generic
  current state:    ready to kdump
  
  kexec command:
-   /sbin/kexec -p 
--command-line="root=UUID=49ed695c-a4dc-4e55-9ca6-319145265826 ro splash quiet 
irqpoll noirqdistrib nr_cpus=1 nousb systemd.unit=kdump-tools.service 
ata_piix.prefer_ms_hyperv=0" --initrd=/var/lib/kdump/initrd.img 
/var/lib/kdump/vmlinuz
+   /sbin/kexec -p 
--command-line="root=UUID=49ed695c-a4dc-4e55-9ca6-319145265826 ro splash quiet 
irqpoll noirqdistrib nr_cpus=1 nousb systemd.unit=kdump-tools.service 
ata_piix.prefer_ms_hyperv=0" --initrd=/var/lib/kdump/initrd.img 
/var/lib/kdump/vmlinuz
  
  > root@whip:~# kdump-config status
  current state   : ready to kdump
  root@whip:~# sysctl -w kernel.sysrq=1
  kernel.sysrq = 1
- root@whip:~# 
+ root@whip:~#
  
  > Triggered crash as below
  
  root@whip:~# echo c > /proc/sysrq-trigger
  [  533.615228] sysrq: SysRq : Trigger a crash
  [  533.615288] Unable to handle kernel paging request for data at address 
0x00000000
  [  533.615397] Faulting instruction address: 0xc000000000792f88
  [  533.615479] Oops: Kernel access of bad area, sig: 11 [#1]
- [  533.615550] SMP NR_CPUS=2048 
- [  533.615551] NUMA 
+ [  533.615550] SMP NR_CPUS=2048
+ [  533.615551] NUMA
  [  533.615590] PowerNV
  [  533.615657] Modules linked in: rpcsec_gss_krb5 nfsv4 nfs fscache sctp_diag 
sctp dccp_diag dccp tcp_diag udp_diag raw_diag inet_diag unix_diag 
af_packet_diag netlink_diag vmx_crypto crct10dif_vpmsum idt_89hpesx ofpart 
cmdlinepart ipmi_powernv powernv_flash ipmi_devintf ibmpowernv ipmi_msghandler 
mtd opal_prd at24 uio_pdrv_genirq uio binfmt_misc dm_multipath scsi_dh_rdac 
scsi_dh_emc scsi_dh_alua sch_fq_codel nfsd auth_rpcgss nfs_acl lockd grace 
sunrpc ip_tables x_tables autofs4 btrfs xor raid6_pq uas usb_storage nouveau 
bnx2x ast i2c_algo_bit ttm drm_kms_helper mdio libcrc32c mlx5_core 
crc32c_vpmsum syscopyarea sysfillrect sysimgblt fb_sys_fops tg3 drm ahci 
libahci mlxfw nvme devlink nvme_core
  [  533.616497] CPU: 10 PID: 4887 Comm: bash Not tainted 4.13.0-25-generic 
#29-Ubuntu
  [  533.616600] task: c000003fa20ab200 task.stack: c000003fa80d4000
  [  533.616702] NIP: c000000000792f88 LR: c000000000793eb8 CTR: 
c000000000792f60
  [  533.616795] REGS: c000003fa80d79f0 TRAP: 0300   Not tainted  
(4.13.0-25-generic)
  [  533.616895] MSR: 9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE>
  [  533.616904]   CR: 28422222  XER: 20040000
  
  > Crash logs as below
  
  root@whip:~# ls -lr /var/crash
  total 21528
  -rw-r----- 1 root root 22002927 Jan 12 04:06 
_usr_lpp_htx_bin_hxestorage.0.crash
  -rw-r----- 1 root root    28945 Jan 16 01:14 
linux-image-4.13.0-25-generic-201801160110.crash
  -rw-r--r-- 1 root root      251 Jan 16 01:14 kexec_cmd
  drwxr-xr-x 2 root root     4096 Jan 16 01:11 201801160110
  root@whip:~# date
  Tue Jan 16 01:14:44 CST 2018
  
  Regards,
  Indira

-- 
You received this bug notification because you are a member of Ubuntu
Bugs, which is subscribed to Ubuntu.
https://bugs.launchpad.net/bugs/1743529

Title:
  Merge kexec-tools 2.0.16-1 from Debian: System hung with Kernel panic
  -not syncing: Out of memory message when crash is triggered.

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu-power-systems/+bug/1743529/+subscriptions

-- 
ubuntu-bugs mailing list
[email protected]
https://lists.ubuntu.com/mailman/listinfo/ubuntu-bugs

Reply via email to