The fix for bug 1709171 now disables CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE, so this bug may be a duplicate of that one?
** Changed in: linux (Ubuntu) Status: Incomplete => Triaged -- You received this bug notification because you are a member of Kernel Packages, which is subscribed to linux in Ubuntu. https://bugs.launchpad.net/bugs/1710922 Title: Ubuntu 17.10 ppc64el guest with MEMORY_HOTPLUG_DEFAULT_ONLINE=y gets a "kernel BUG at mm/memory_hotplug.c:2185" when hotplugging LMBs with QEMU upstream Status in The Ubuntu-power-systems project: Incomplete Status in linux package in Ubuntu: Triaged Bug description: == Comment: #0 - Daniel Henrique Barboza <danie...@br.ibm.com> - 2017-08-08 09:13:31 == - Host information: Ubuntu 16.10 running upstream QEMU $ uname -a Linux louis 4.10.0-20-generic #22-Ubuntu SMP Thu Apr 20 09:22:16 UTC 2017 ppc64le ppc64le ppc64le GNU/Linux $ cat /proc/cpuinfo processor : 0 cpu : POWER8E (raw), altivec supported clock : 2061.000000MHz revision : 2.1 (pvr 004b 0201) (...) timebase : 512000000 platform : PowerNV model : 8247-42L machine : PowerNV 8247-42L firmware : OPAL - qemu command line that launched the Ubuntu 17.10 ppc64el guest: sudo ./qemu-system-ppc64 -name migrate_qemu -boot strict=on --enable- kvm -device nec-usb-xhci,id=usb,bus=pci.0,addr=0xf -device spapr- vscsi,id=scsi0,reg=0x2000 -smp 1,maxcpus=4,sockets=4,cores=1,threads=1 --machine pseries,accel=kvm,usb=off,dump-guest-core=off -m 4G,slots=32,maxmem=32G -drive file=/home/danielhb/vm_imgs/ub1710.qcow2,format=qcow2,if=none,id =drive-virtio-disk0,cache=none -device virtio-blk- pci,scsi=off,bus=pci.0,addr=0x2,drive=drive-virtio-disk0,id=virtio- disk0,bootindex=1 -nographic - guest information: Ubuntu 17.10 ppc64el: root@ubuntu1710:~# uname -a Linux ubuntu1710 4.11.0-10-generic #15-Ubuntu SMP Thu Jun 29 15:02:54 UTC 2017 ppc64le ppc64le ppc64le GNU/Linux root@ubuntu1710:~# - Problem: hotplugging a LMB generates a guest kernel Oops: root@ubuntu1710:~# QEMU 2.9.90 monitor - type 'help' for more information (qemu) (qemu) object_add memory-backend-ram,id=ram1,size=1G (qemu) device_add 
pc-dimm,id=dimm1,memdev=ram1 (qemu) [ 126.850952] kernel BUG at /build/linux-S1V_3d/linux-4.11.0/mm/memory_hotplug.c:2185! [ 126.851285] Oops: Exception in kernel mode, sig: 5 [#1] [ 126.851428] SMP NR_CPUS=2048 [ 126.851428] NUMA [ 126.851546] pSeries [ 126.851714] Modules linked in: vmx_crypto ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ip_tables x_tables autofs4 btrfs raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear ibmvscsi crc32c_vpmsum virtio_blk [ 126.852447] CPU: 0 PID: 5 Comm: kworker/u8:0 Not tainted 4.11.0-10-generic #15-Ubuntu [ 126.852656] Workqueue: pseries hotplug workque pseries_hp_work_fn [ 126.852828] task: c0000000fea80000 task.stack: c0000000fe118000 [ 126.853000] NIP: c000000000350268 LR: c0000000003501e0 CTR: 0000000000000000 [ 126.853190] REGS: c0000000fe11b780 TRAP: 0700 Not tainted (4.11.0-10-generic) [ 126.853390] MSR: 800000000282b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> [ 126.853396] CR: 42002422 XER: 20000000 [ 126.853672] CFAR: c0000000003501e4 SOFTE: 1 [ 126.853672] GPR00: c0000000003501e0 c0000000fe11ba00 c00000000149eb00 0000000000000001 [ 126.853672] GPR04: c0000000f9901480 c0000000ffe21c00 000000000000003e 0000000000000003 [ 126.853672] GPR08: 0000000000000002 0000000000000003 0000000000000003 303078302d303030 [ 126.853672] GPR12: 0000000000002200 c00000000fb80000 c000000000110008 c0000000fe1810c0 [ 126.853672] GPR16: c0000000fe050ea8 0000000000000010 c0000000fffffc30 c0000000fffffea0 [ 126.853672] GPR20: c0000000f951a1a4 0000000000000004 0000000000000001 0000000000000010 [ 126.853672] GPR24: 0000000000000001 c0000000f951a1a0 0000000000000004 0000000000000000 [ 126.853672] GPR28: 0000000000000000 0000000000000001 0000000010000000 0000000140000000 [ 126.855221] NIP [c000000000350268] remove_memory+0xf8/0x100 [ 126.855338] LR [c0000000003501e0] remove_memory+0x70/0x100 [ 126.855453] Call Trace: [ 
126.855520] [c0000000fe11ba00] [c0000000003501e0] remove_memory+0x70/0x100 (unreliable) [ 126.855684] [c0000000fe11ba40] [c0000000000b0880] dlpar_add_lmb+0x370/0x3f0 [ 126.855822] [c0000000fe11bb20] [c0000000000b174c] dlpar_memory+0x7cc/0xd20 [ 126.855959] [c0000000fe11bbf0] [c0000000000a9af8] handle_dlpar_errorlog+0xa8/0x170 [ 126.856118] [c0000000fe11bc60] [c0000000000a9c54] pseries_hp_work_fn+0x94/0xa0 [ 126.856275] [c0000000fe11bc90] [c0000000001071d0] process_one_work+0x2b0/0x5a0 [ 126.856430] [c0000000fe11bd20] [c000000000107568] worker_thread+0xa8/0x670 [ 126.856563] [c0000000fe11bdc0] [c000000000110164] kthread+0x164/0x1b0 [ 126.856695] [c0000000fe11be30] [c00000000000b4e8] ret_from_kernel_thread+0x5c/0x74 [ 126.856846] Instruction dump: [ 126.856931] 60000000 387f0060 48824b19 60000000 38210040 e8010010 eb81ffe0 eba1ffe8 [ 126.857088] ebc1fff0 ebe1fff8 7c0803a6 4e800020 <0fe00000> 00000000 3c4c0115 3842e890 [ 126.857243] ---[ end trace 76fab848b8f01d0a ]--- [ 126.859577] Investigating the cause I've found this kernel commit: commit 943db62c316c578f8e2cc6fb81a5f641096b29bf Author: Nathan Fontenot <nf...@linux.vnet.ibm.com> Date: Wed Feb 15 13:45:30 2017 -0500 powerpc/pseries: Revert 'Auto-online hotplugged memory' This reverts commit ec999072442a ("powerpc/pseries: Auto-online hotplugged memory"), and 9dc512819e4b ("powerpc: Fix unused function warning 'lmb_to_memblock'"). Using the auto-online capability does online added memory but does not update the associated device struct to indicate that the memory is online. This causes the pseries memory DLPAR code to fail when trying to remove a LMB that was previously removed and added back. This happens when validating that the LMB is removable. This patch reverts to the previous behavior of calling device_online() to online the LMB when it is DLPAR added and moves the lmb_to_memblock() routine out of CONFIG_MEMORY_HOTREMOVE now that we call it for add.
This commit removed a specific kernel configuration in the revert: --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -58,7 +58,6 @@ CONFIG_KEXEC_FILE=y CONFIG_IRQ_ALL_CPUS=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y Using the vanilla kernel from Linus I've got the following default config for pseries: [danielhb@arthas linux]$ ARCH=powerpc make pseries_defconfig # # configuration written to .config # [danielhb@arthas linux]$ grep -R 'HOTPLUG_DEFAULT' . ./mm/Kconfig:config MEMORY_HOTPLUG_DEFAULT_ONLINE ./mm/memory_hotplug.c:#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE ./.config:# CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE is not set ./Documentation/admin-guide/kernel-parameters.txt: CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config ./Documentation/memory-hotplug.txt:The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config [danielhb@arthas linux]$ As we can see from the grep result, the .config was generated without the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE option and there is no other place in the code where it is set to Y. In the mm/Kconfig we have: (...) config MEMORY_HOTPLUG_DEFAULT_ONLINE bool "Online the newly added memory blocks by default" default n depends on MEMORY_HOTPLUG help (...) This shows that the default value for this option is N, which makes sense with the change made in the patch - the absence of the option in .config disables the auto_online_blocks feature. However, the guest Ubuntu 17.10 ppc64el kernel is setting this option to Y: root@ubuntu1710:~# cat /boot/config-4.11.0-10-generic | grep HOTPLUG_DEFAULT CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y root@ubuntu1710:~# I am not sure if the intent was to enable just for x86 or all architectures but, as is, this is breaking memory hotplug in pseries after the mentioned kernel commit 943db62c316c578f8e2cc6fb81a5f641096b29bf. 
Given that the default behavior when the option is not set is N, my suggestion is to change the MEMORY_HOTPLUG_DEFAULT_ONLINE to 'not set' in any ppc64el config file in the Ubuntu build, following the defconfig we have in the vanilla kernel. - Workarounds: The most obvious one: if I recompile the Ubuntu 17.10 kernel without this option (or setting it to 'n'), LMB hotplug works. Another possible workaround, further documented in the kernel commits that introduced the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE option, is to manually set auto_online_blocks to 'offline'. Doing that prior to the hotplug avoids the kernel Oops: root@ubuntu1710:~# echo offline > /sys/devices/system/memory/auto_online_blocks root@ubuntu1710:~# root@ubuntu1710:~# grep Mem /proc/meminfo MemTotal: 4169088 kB MemFree: 3725632 kB MemAvailable: 3917056 kB root@ubuntu1710:~# root@ubuntu1710:~# QEMU 2.9.90 monitor - type 'help' for more information (qemu) (qemu) object_add memory-backend-ram,id=ram0,size=1G (qemu) device_add pc-dimm,id=dimm0,memdev=ram0 (qemu) root@ubuntu1710:~# root@ubuntu1710:~# grep Mem /proc/meminfo MemTotal: 5217664 kB MemFree: 4772864 kB MemAvailable: 4956928 kB root@ubuntu1710:~# As we can see, memory can be hotplugged normally if we manually disable auto_online_blocks. Additional notes: - This same problem was also reported against Fedora 26 guests here: https://bugzilla.redhat.com/show_bug.cgi?id=1476380. - Since I opened this Red Hat bug there have been some developments on the PPC kernel mailing list, but none that actually solves the problem seen here in the upstream 4.13+ kernel. The safest course of action is to disable this option until things are sorted out upstream. Let me know if you need any extra information about the issue or the tests. Thanks, Daniel == Comment: #5 - Nathan D.
Fontenot <nfont...@us.ibm.com> - 2017-08-15 11:23:54 == This problem is resolved by a recent commit upstream (commit id 1a367063ca0c) commit 1a367063ca0c1c6f6f54b5abd7b4836b0866a07b Author: Nathan Fontenot <nf...@linux.vnet.ibm.com> Date: Wed Aug 2 14:03:22 2017 -0400 powerpc/pseries: Check memory device state before onlining/offlining When DLPAR adding or removing memory we need to check the device offline status before trying to online/offline the memory. This is needed because calls to device_online() and device_offline() will return non-zero for memory that is already online and offline respectively. This update resolves two scenarios. First, for a kernel built with auto-online memory enabled (CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y), memory will be onlined as part of calls to add_memory(). After adding the memory the pseries DLPAR code tries to online it and fails since the memory is already online. The DLPAR code then tries to remove the memory which produces the oops message below because the memory is not offline. The second scenario occurs when removing memory that is already offline, i.e. marking memory offline (via sysfs) and then trying to remove that memory. This doesn't work because offlining the already offline memory does not succeed and the DLPAR code then fails the DLPAR remove operation. The fix for both scenarios is to check the device.offline status before making the calls to device_online() or device_offline(). kernel BUG at mm/memory_hotplug.c:1936! ... 
NIP [c0000000002ca428] .remove_memory+0xb8/0xc0 LR [c0000000002ca3cc] .remove_memory+0x5c/0xc0 Call Trace: .remove_memory+0x5c/0xc0 (unreliable) .dlpar_add_lmb+0x384/0x400 .dlpar_memory+0x5dc/0xca0 .handle_dlpar_errorlog+0x74/0xe0 .pseries_hp_work_fn+0x2c/0x90 .process_one_work+0x17c/0x460 .worker_thread+0x88/0x500 .kthread+0x15c/0x1a0 .ret_from_kernel_thread+0x58/0xc0 Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memo Signed-off-by: Nathan Fontenot <nf...@linux.vnet.ibm.com> [mpe: Use bool, add explicit rc=0 case, change log typos & formatting] Signed-off-by: Michael Ellerman <m...@ellerman.id.au> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/pl index ca9b2f4..9e3afd2 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -336,7 +336,38 @@ static struct memory_block *lmb_to_memblock(struct of_drc return mem_block; } +static int dlpar_change_lmb_state(struct of_drconf_cell *lmb, bool online) +{ + struct memory_block *mem_block; + int rc; + + mem_block = lmb_to_memblock(lmb); + if (!mem_block) + return -EINVAL; + + if (online && mem_block->dev.offline) + rc = device_online(&mem_block->dev); + else if (!online && !mem_block->dev.offline) + rc = device_offline(&mem_block->dev); + else + rc = 0; + + put_device(&mem_block->dev); + + return rc; +} + +static int dlpar_online_lmb(struct of_drconf_cell *lmb) +{ + return dlpar_change_lmb_state(lmb, true); +} + #ifdef CONFIG_MEMORY_HOTREMOVE +static int dlpar_offline_lmb(struct of_drconf_cell *lmb) +{ + return dlpar_change_lmb_state(lmb, false); +} + static int pseries_remove_memblock(unsigned long base, unsigned int memblock_ { unsigned long block_sz, start_pfn; @@ -431,19 +462,13 @@ static int dlpar_add_lmb(struct of_drconf_cell *); static int dlpar_remove_lmb(struct of_drconf_cell *lmb) { - struct memory_block *mem_block; unsigned long block_sz; int nid, rc; if (!lmb_is_removable(lmb)) return -EINVAL; - 
mem_block = lmb_to_memblock(lmb); - if (!mem_block) - return -EINVAL; - - rc = device_offline(&mem_block->dev); - put_device(&mem_block->dev); + rc = dlpar_offline_lmb(lmb); if (rc) return rc; @@ -737,20 +762,6 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static int dlpar_online_lmb(struct of_drconf_cell *lmb) -{ - struct memory_block *mem_block; - int rc; - - mem_block = lmb_to_memblock(lmb); - if (!mem_block) - return -EINVAL; - - rc = device_online(&mem_block->dev); - put_device(&mem_block->dev); - return rc; -} - static int dlpar_add_lmb(struct of_drconf_cell *lmb) { unsigned long block_sz; To manage notifications about this bug go to: https://bugs.launchpad.net/ubuntu-power-systems/+bug/1710922/+subscriptions -- Mailing list: https://launchpad.net/~kernel-packages Post to : kernel-packages@lists.launchpad.net Unsubscribe : https://launchpad.net/~kernel-packages More help : https://help.launchpad.net/ListHelp