[ https://issues.apache.org/jira/browse/CLOUDSTACK-10310?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16585617#comment-16585617 ]
ASF GitHub Bot commented on CLOUDSTACK-10310: --------------------------------------------- DaanHoogland closed pull request #2722: CLOUDSTACK-10310 Fix KVM reboot on storage issue URL: https://github.com/apache/cloudstack/pull/2722 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java index be5ab396d19..f180848a8d5 100644 --- a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java +++ b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java @@ -34,7 +34,8 @@ protected static String s_heartBeatPath; protected long _heartBeatUpdateTimeout = 60000; protected long _heartBeatUpdateFreq = 60000; - protected long _heartBeatUpdateMaxRetry = 3; + protected long _heartBeatUpdateMaxTries = 5; + protected long _heartBeatUpdateRetrySleep = 15000; public static enum PoolType { PrimaryStorage, SecondaryStorage diff --git a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java index 0cebb4c9b00..8a11b7fc962 100644 --- a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java +++ b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java @@ -119,7 +119,8 @@ protected void runInContext() { } String result = null; - for (int i = 0; i < 5; i++) { + // Try multiple times, but sleep in between tries to ensure it isn't a short lived transient error + for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) { Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger); cmd.add("-i", 
primaryStoragePool._poolIp); cmd.add("-p", primaryStoragePool._poolMountSourcePath); @@ -127,14 +128,21 @@ protected void runInContext() { cmd.add("-h", _hostIP); result = cmd.execute(); if (result != null) { - s_logger.warn("write heartbeat failed: " + result + ", retry: " + i); + s_logger.warn("write heartbeat failed: " + result + ", try: " + i + " of " + _heartBeatUpdateMaxTries); + try { + Thread.sleep(_heartBeatUpdateRetrySleep); + } catch (InterruptedException e) { + s_logger.debug("[ignored] interupted between heartbeat retries."); + } } else { break; } } if (result != null) { - s_logger.warn("write heartbeat failed: " + result + "; reboot the host"); + // Stop cloudstack-agent if can't write to heartbeat file. + // This will raise an alert on the mgmt server + s_logger.warn("write heartbeat failed: " + result + "; stopping cloudstack-agent"); Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger); cmd.add("-i", primaryStoragePool._poolIp); cmd.add("-p", primaryStoragePool._poolMountSourcePath); diff --git a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh index 7c8ee67f30c..30ca72a2aa9 100755 --- a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh +++ b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh @@ -155,10 +155,10 @@ then exit 0 elif [ "$cflag" == "1" ] then - /usr/bin/logger -t heartbeat "kvmheartbeat.sh rebooted system because it was unable to write the heartbeat to the storage." + /usr/bin/logger -t heartbeat "kvmheartbeat.sh stopped cloudstack-agent because it was unable to write the heartbeat to the storage." sync & sleep 5 - echo b > /proc/sysrq-trigger + service cloudstack-agent stop exit $? else write_hbLog ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org > KVM hosts reboot if there is a short transient storage error > ------------------------------------------------------------ > > Key: CLOUDSTACK-10310 > URL: https://issues.apache.org/jira/browse/CLOUDSTACK-10310 > Project: CloudStack > Issue Type: Improvement > Security Level: Public(Anyone can view this level - this is the > default.) > Components: KVM > Affects Versions: 4.9.0, 4.10.0.0 > Reporter: Sean Lair > Priority: Major > > If the KVM heartbeat file can't be written to, the host is rebooted, thus > taking down all VMs running on it. The code does try 5 times before the > reboot, but there is no delay between the retries, so they are 5 > simultaneous retries, which doesn't help. Standard SAN storage HA operations > or a quick network blip could cause this reboot to occur. > Some discussions on the dev mailing list revealed that some people are just > commenting out the reboot line in their version of the CloudStack source. > A better option (and a new PR is being issued) would be to have it sleep between > tries so it isn't 5x almost simultaneous tries. Plus, instead of rebooting, > the cloudstack-agent could just be stopped on the host. This will > cause alerts to be issued and if the host is disconnected long enough, > depending on the HA code in use, VM HA could handle the host failure. > The built-in reboot of the host seemed drastic. -- This message was sent by Atlassian JIRA (v7.6.3#76005)