Hi, I have a guest VM which becomes unresponsive after a period, sometimes a few hours, sometimes up to a day or so.
What can I do at the qemu level to start working out where things are going wrong? guest: linux-4.3 host: linux-3.18.21, qemu-2.1, ceph-0.94.5 Prior to the problem the guest is running 30 or so rsyncs inbound from the virtio network, writing about 20MB/s to the one multi-TB xfs fs on a virtio disk backed by ceph rbd via librbd. "Unresponsive" means the guest stops responding to the network (ssh, pings) and other network traffic (i.e. the rsyncs) stops, the spice console shows a login message but doesn't respond to keystrokes, the serial console (virsh console vm102) also doesn't respond. The only way out so far has been to kill the qemu process (virsh destroy). After rebooting there's nothing in the guest's logs to indicate what might have happened. The host is running around 20 other VMs with a mix of various windows and linux versions on the same ceph infrastructure without any other problems. Whilst the problem is occurring the qemu process builds up cpu time slowly, e.g. ps shows it used 3 seconds of cpu time in 60 seconds elapsed. 
'perf top -p pid' shows: Samples: 18K of event 'cycles', Event count (approx.): 1198214862 Overhead Shared Object Symbol 16.48% [kernel] [k] __lock_acquire.isra.32 5.81% [kernel] [k] lock_release 4.05% [kernel] [k] lock_acquired 3.66% [kernel] [k] native_sched_clock 3.40% [kernel] [k] kvm_arch_vcpu_ioctl_run 3.35% [kernel] [k] lock_acquire 2.48% [kernel] [k] lock_release_holdtime.part.23 2.31% [kernel] [k] vmx_vcpu_run 2.24% [kernel] [k] __fget 1.33% [kernel] [k] do_sys_poll 1.29% [kernel] [k] _raw_spin_unlock_irqrestore 1.19% [kernel] [k] generic_exec_single 1.09% [kernel] [k] _raw_spin_lock_irqsave 0.97% [kernel] [k] vmx_save_host_state 0.90% [kernel] [k] timerqueue_add 0.89% [kernel] [k] __srcu_read_lock 0.88% [kernel] [k] do_raw_spin_trylock 0.71% [kernel] [k] vmcs_writel 0.68% [kernel] [k] __hrtimer_start_range_ns 0.64% [kernel] [k] set_spte 0.63% [kernel] [k] add_atomic_switch_msr 0.60% [kernel] [k] fput 0.58% [kernel] [k] kvm_read_guest_cached 0.57% [kernel] [k] copy_user_generic_string 0.53% [kernel] [k] eventfd_poll 0.52% qemu-system-x86_64 [.] 
0x00000000003759ac 0.49% [kernel] [k] vmx_handle_exit 0.49% [kernel] [k] __apic_accept_irq 0.49% [kernel] [k] dequeue_task_fair 0.49% [kernel] [k] kvm_arch_vcpu_runnable 0.48% [kernel] [k] kvm_set_shared_msr 0.47% [kernel] [k] local_clock 0.46% [kernel] [k] vmx_vcpu_load 0.45% [kernel] [k] find_busiest_group 0.44% [kernel] [k] __perf_event_task_sched_out 0.44% [kernel] [k] __schedule 0.43% [kernel] [k] prepare_to_wait 0.43% [kernel] [k] update_curr 0.41% [kernel] [k] do_raw_spin_unlock 0.41% [kernel] [k] tcp_poll 0.40% [kernel] [k] kvm_arch_vcpu_load 0.39% [kernel] [k] kvm_lapic_sync_to_vapic 0.39% [kernel] [k] apic_update_ppr 0.38% [kernel] [k] vmcs_load 0.38% [kernel] [k] vmx_interrupt_allowed 0.37% [kernel] [k] kvm_check_async_pf_completion 0.37% [kernel] [k] remove_wait_queue 0.37% [kernel] [k] kvm_apic_has_interrupt 0.35% [kernel] [k] pick_next_task_fair 0.35% [kernel] [k] change_protection 0.34% [kernel] [k] vmx_sched_in 0.33% [kernel] [k] vmx_inject_irq 0.32% [kernel] [k] gup_pte_range 0.32% [kernel] [k] match_held_lock 0.31% [kernel] [k] kvm_get_apic_interrupt 0.31% [kernel] [k] apic_reg_write 0.31% [kernel] [k] handle_mm_fault 0.30% [kernel] [k] apic_has_pending_timer 0.30% [kernel] [k] apic_set_eoi 0.30% [kernel] [k] kvm_write_guest_cached 0.30% [kernel] [k] cpuacct_charge 0.30% [kernel] [k] copy_page_rep 0.29% [kernel] [k] kvm_vcpu_block 0.29% [kernel] [k] __pollwait 0.29% [kernel] [k] kvm_set_msr_common 0.28% [kernel] [k] kvm_ioapic_handles_vector 0.28% [kernel] [k] mmu_set_spte 0.28% [kernel] [k] kvm_sched_in 0.28% [kernel] [k] vmx_set_msr 0.27% [kernel] [k] sock_poll 0.27% [kernel] [k] update_cr8_intercept 0.27% [kernel] [k] update_cfs_shares 0.26% librt-2.13.so [.] clock_gettime 0.26% [kernel] [k] kvm_apic_accept_pic_intr 0.25% [kernel] [k] follow_page_pte 0.25% libpthread-2.13.so [.] pthread_mutex_lock 0.24% [kernel] [k] account_entity_dequeue 0.24% libglib-2.0.so.0.3200.4 [.] 
g_main_context_check 0.23% [kernel] [k] ktime_get 0.23% [kernel] [k] vmx_flush_tlb 0.22% [kernel] [k] intel_pmu_disable_all 0.22% [kernel] [k] clear_atomic_switch_msr The guest is managed by libvirt and started like: qemu-system-x86_64 -enable-kvm -name vm102 -S -machine pc-i440fx-2.1,accel=kvm,usb=off -cpu Westmere -m 8192 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1 -uuid xxxxxxxx -no-user-config -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/vm102.monitor,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=discard -no-hpet -no-shutdown -global PIIX4_PM.disable_s3=1 -global PIIX4_PM.disable_s4=1 -boot strict=on -device ich9-usb-ehci1,id=usb,bus=pci.0,addr=0x5.0x7 -device ich9-usb-uhci1,masterbus=usb.0,firstport=0,bus=pci.0,multifunction=on,addr=0x5 -device ich9-usb-uhci2,masterbus=usb.0,firstport=2,bus=pci.0,addr=0x5.0x1 -device ich9-usb-uhci3,masterbus=usb.0,firstport=4,bus=pci.0,addr=0x5.0x2 -device virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x6 -drive if=none,id=drive-ide0-0-0,readonly=on,format=raw -device ide-cd,bus=ide.0,unit=0,drive=drive-ide0-0-0,id=ide0-0-0 -drive file=rbd:rbd-sas/vm102%root:id=u102:key=xxxxxxxxx==:auth_supported=cephx\;none,if=none,id=drive-virtio-disk0,format=raw -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 -drive file=rbd:rbd-sas/vm102%swap:id=u102:key=xxxxxxxxx==:auth_supported=cephx\;none,if=none,id=drive-virtio-disk1,format=raw -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x9,drive=drive-virtio-disk1,id=virtio-disk1 -drive file=rbd:rbd-sas/vm102%tmp:id=u102:key=xxxxxxxxx==:auth_supported=cephx\;none,if=none,id=drive-virtio-disk2,format=raw -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0xa,drive=drive-virtio-disk2,id=virtio-disk2 -drive file=rbd:rbd/vm102%var:id=u102:key=xxxxxxxxx==:auth_supported=cephx\;none,if=none,id=drive-virtio-disk3,format=raw -device 
virtio-blk-pci,scsi=off,bus=pci.0,addr=0xb,drive=drive-virtio-disk3,id=virtio-disk3 -drive file=rbd:rbd-x3/vm102%data:id=u102:key=xxxxxxxxx==:auth_supported=cephx\;none,if=none,id=drive-virtio-disk4,format=raw -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0xc,drive=drive-virtio-disk4,id=virtio-disk4 -netdev tap,fd=29,id=hostnet0 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:8e:3c:37,bus=pci.0,addr=0x3 -chardev socket,id=charchannel0,path=/var/lib/libvirt/qemu/channel/target/vm102.org.qemu.guest_agent.0,server,nowait -device virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=org.qemu.guest_agent.0 -chardev spicevmc,id=charchannel1,name=vdagent -device virtserialport,bus=virtio-serial0.0,nr=2,chardev=charchannel1,id=channel1,name=com.redhat.spice.0 -chardev pty,id=charconsole0 -device virtconsole,chardev=charconsole0,id=console0 -device usb-tablet,id=input0 -spice port=5904,addr=127.0.0.1,disable-ticketing,seamless-migration=on -device qxl-vga,id=video0,ram_size=67108864,vram_size=67108864,bus=pci.0,addr=0x2 -chardev spicevmc,id=charredir0,name=usbredir -device usb-redir,chardev=charredir0,id=redir0 -chardev spicevmc,id=charredir1,name=usbredir -device usb-redir,chardev=charredir1,id=redir1 -chardev spicevmc,id=charredir2,name=usbredir -device usb-redir,chardev=charredir2,id=redir2 -chardev spicevmc,id=charredir3,name=usbredir -device usb-redir,chardev=charredir3,id=redir3 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x7 -msg timestamp=on At this point I don't know if this is a guest kernel problem, a qemu problem, a host kernel problem, or ceph problem. What should I be looking at to find out what's actually going wrong? Cheers, Chris
