This is an automated email from the ASF dual-hosted git repository.
sureshanaparti pushed a commit to branch 4.22
in repository https://gitbox.apache.org/repos/asf/cloudstack.git
The following commit(s) were added to refs/heads/4.22 by this push:
new 4359198904c KVM Host HA improvements - Fix to not cancel VM HA items
when Host HA inspection in progress, and some code improvements (#13088)
4359198904c is described below
commit 4359198904c54f9b5bc9afa0d9e474beb4ec2d4d
Author: Suresh Kumar Anaparti <[email protected]>
AuthorDate: Fri May 8 19:50:50 2026 +0530
KVM Host HA improvements - Fix to not cancel VM HA items when Host HA
inspection in progress, and some code improvements (#13088)
* Host HA code improvements
* Fix to not cancel VM HA items when Host HA is enabled & inspection in
progress, and some code improvements
- When Host HA inspection is in progress, the investigator returns the Host
Status as Up which cancels the VM HA items
- Don't cancel the VM HA items, instead reschedule them to try again later
* Changes to consider Recovered/Available Host HA state along with the
agent connection status to determine the Host HA inspection in progress or not,
and some code improvements
---
api/src/main/java/com/cloud/ha/Investigator.java | 16 +-
.../command/admin/ha/ConfigureHAForHostCmd.java | 1 +
.../com/cloud/agent/api/CheckOnHostAnswer.java | 3 +-
.../com/cloud/agent/api/CheckOnHostCommand.java | 10 +-
.../java/com/cloud/ha/HighAvailabilityManager.java | 4 +-
.../main/java/com/cloud/ha/HypervInvestigator.java | 6 +-
.../main/java/com/cloud/ha/KVMInvestigator.java | 87 +++------
.../cloud/hypervisor/kvm/resource/KVMHABase.java | 11 +-
.../hypervisor/kvm/resource/KVMHAChecker.java | 33 ++--
.../hypervisor/kvm/resource/KVMHAMonitor.java | 65 +++----
.../kvm/resource/KVMHAVMActivityChecker.java | 6 +-
.../kvm/resource/LibvirtComputingResource.java | 6 +-
.../wrapper/LibvirtCheckOnHostCommandWrapper.java | 16 +-
...CheckVMActivityOnStoragePoolCommandWrapper.java | 4 +-
.../kvm/storage/IscsiAdmStoragePool.java | 4 +-
.../hypervisor/kvm/storage/KVMStoragePool.java | 70 ++++----
.../hypervisor/kvm/storage/LibvirtStoragePool.java | 36 ++--
.../hypervisor/kvm/storage/MultipathSCSIPool.java | 6 +-
.../hypervisor/kvm/storage/ScaleIOStoragePool.java | 4 +-
.../org/apache/cloudstack/kvm/ha/KVMHAConfig.java | 23 ++-
.../apache/cloudstack/kvm/ha/KVMHAProvider.java | 7 +-
.../cloudstack/kvm/ha/KVMHostActivityChecker.java | 198 ++++++++++++---------
.../kvm/resource/LibvirtComputingResourceTest.java | 2 +-
.../java/com/cloud/ha/SimulatorInvestigator.java | 4 +-
.../main/java/com/cloud/ha/VmwareInvestigator.java | 2 +-
.../hypervisor/kvm/storage/LinstorStoragePool.java | 14 +-
.../kvm/storage/StorPoolStoragePool.java | 10 +-
scripts/vm/hypervisor/kvm/kvmheartbeat.sh | 4 +-
.../com/cloud/ha/CheckOnAgentInvestigator.java | 2 +-
.../com/cloud/ha/HighAvailabilityManagerImpl.java | 136 +++++++++-----
server/src/main/java/com/cloud/ha/KVMFencer.java | 11 +-
.../cloud/ha/ManagementIPSystemVMInvestigator.java | 2 +-
.../java/com/cloud/ha/UserVmDomRInvestigator.java | 2 +-
.../java/com/cloud/ha/XenServerInvestigator.java | 4 +-
.../java/org/apache/cloudstack/ha/HAManager.java | 7 +-
.../org/apache/cloudstack/ha/HAManagerImpl.java | 72 ++++----
.../apache/cloudstack/ha/HAResourceCounter.java | 6 +-
.../cloudstack/ha/task/ActivityCheckTask.java | 6 +-
.../org/apache/cloudstack/ha/task/BaseHATask.java | 2 +-
.../cloud/ha/HighAvailabilityManagerImplTest.java | 5 +-
.../cloudstack/utils/redfish/RedfishClient.java | 14 +-
41 files changed, 481 insertions(+), 440 deletions(-)
diff --git a/api/src/main/java/com/cloud/ha/Investigator.java
b/api/src/main/java/com/cloud/ha/Investigator.java
index 88d802a1ce4..00371d395f5 100644
--- a/api/src/main/java/com/cloud/ha/Investigator.java
+++ b/api/src/main/java/com/cloud/ha/Investigator.java
@@ -26,17 +26,19 @@ public interface Investigator extends Adapter {
* Returns if the vm is still alive.
*
* @param vm to work on.
+ * @return true if vm is alive, otherwise false
*/
- public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;
+ boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;
- public Status isAgentAlive(Host agent);
+ /**
+ * Returns the agent status of the host.
+ *
+ * @param host
+ * @return status of the host agent
+ */
+ Status getHostAgentStatus(Host host);
class UnknownVM extends Exception {
-
- /**
- *
- */
private static final long serialVersionUID = 1L;
-
};
}
diff --git
a/api/src/main/java/org/apache/cloudstack/api/command/admin/ha/ConfigureHAForHostCmd.java
b/api/src/main/java/org/apache/cloudstack/api/command/admin/ha/ConfigureHAForHostCmd.java
index d7707e197d6..ed3b166e6ea 100644
---
a/api/src/main/java/org/apache/cloudstack/api/command/admin/ha/ConfigureHAForHostCmd.java
+++
b/api/src/main/java/org/apache/cloudstack/api/command/admin/ha/ConfigureHAForHostCmd.java
@@ -87,6 +87,7 @@ public final class ConfigureHAForHostCmd extends BaseAsyncCmd
{
final HostHAResponse response = new HostHAResponse();
response.setId(resourceUuid);
response.setProvider(getHaProvider().toLowerCase());
+ response.setStatus(result);
response.setResponseName(getCommandName());
setResponseObject(response);
}
diff --git a/core/src/main/java/com/cloud/agent/api/CheckOnHostAnswer.java
b/core/src/main/java/com/cloud/agent/api/CheckOnHostAnswer.java
index 5a26b22ec6a..41e266784db 100644
--- a/core/src/main/java/com/cloud/agent/api/CheckOnHostAnswer.java
+++ b/core/src/main/java/com/cloud/agent/api/CheckOnHostAnswer.java
@@ -38,6 +38,8 @@ public class CheckOnHostAnswer extends Answer {
public CheckOnHostAnswer(CheckOnHostCommand cmd, String details) {
super(cmd, false, details);
+ determined = false;
+ alive = false;
}
public boolean isDetermined() {
@@ -47,5 +49,4 @@ public class CheckOnHostAnswer extends Answer {
public boolean isAlive() {
return alive;
}
-
}
diff --git a/core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java
b/core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java
index 94239f2900e..72b5217604d 100644
--- a/core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java
+++ b/core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java
@@ -24,7 +24,7 @@ import com.cloud.host.Host;
public class CheckOnHostCommand extends Command {
HostTO host;
- boolean reportCheckFailureIfOneStorageIsDown;
+ boolean reportIfHeartBeatFailedForOneStoragePool;
protected CheckOnHostCommand() {
}
@@ -34,17 +34,17 @@ public class CheckOnHostCommand extends Command {
setWait(20);
}
- public CheckOnHostCommand(Host host, boolean
reportCheckFailureIfOneStorageIsDown) {
+ public CheckOnHostCommand(Host host, boolean
reportIfHeartBeatFailedForOneStoragePool) {
this(host);
- this.reportCheckFailureIfOneStorageIsDown =
reportCheckFailureIfOneStorageIsDown;
+ this.reportIfHeartBeatFailedForOneStoragePool =
reportIfHeartBeatFailedForOneStoragePool;
}
public HostTO getHost() {
return host;
}
- public boolean isCheckFailedOnOneStorage() {
- return reportCheckFailureIfOneStorageIsDown;
+ public boolean shouldReportIfHeartBeatFailedForOneStoragePool() {
+ return reportIfHeartBeatFailedForOneStoragePool;
}
@Override
diff --git
a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
index 3ae94479cea..53bfcce2703 100644
---
a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
+++
b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
@@ -75,10 +75,10 @@ public interface HighAvailabilityManager extends Manager {
+ " which are registered for the HA event that were successful and are
now ready to be purged.",
true, Cluster);
- public static final ConfigKey<Boolean>
KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced",
Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
+ ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new
ConfigKey<>("Advanced", Boolean.class,
"kvm.ha.fence.on.storage.heartbeat.failure", "false",
"Proceed fencing the host even the heartbeat failed for only one
storage pool", false, ConfigKey.Scope.Zone);
- public enum WorkType {
+ enum WorkType {
Migration, // Migrating VMs off of a host.
Stop, // Stops a VM for storage pool migration purposes. This
should be obsolete now.
CheckStop, // Checks if a VM has been stopped.
diff --git
a/plugins/hypervisors/hyperv/src/main/java/com/cloud/ha/HypervInvestigator.java
b/plugins/hypervisors/hyperv/src/main/java/com/cloud/ha/HypervInvestigator.java
index 3d79b9efdd1..4e44d8cb735 100644
---
a/plugins/hypervisors/hyperv/src/main/java/com/cloud/ha/HypervInvestigator.java
+++
b/plugins/hypervisors/hyperv/src/main/java/com/cloud/ha/HypervInvestigator.java
@@ -41,15 +41,15 @@ public class HypervInvestigator extends AdapterBase
implements Investigator {
@Override
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws
UnknownVM {
- Status status = isAgentAlive(host);
+ Status status = getHostAgentStatus(host);
if (status == null) {
throw new UnknownVM();
}
- return status == Status.Up ? true : null;
+ return status == Status.Up;
}
@Override
- public Status isAgentAlive(Host agent) {
+ public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() != Hypervisor.HypervisorType.Hyperv) {
return null;
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java
index ce9fbe6c232..da9a0d6e291 100644
--- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java
+++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java
@@ -19,10 +19,7 @@
package com.cloud.ha;
import com.cloud.agent.AgentManager;
-import com.cloud.agent.api.Answer;
-import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.host.Host;
-import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.hypervisor.Hypervisor;
@@ -34,11 +31,12 @@ import
org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProvider;
import
org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
import
org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver;
import org.apache.cloudstack.ha.HAManager;
+import org.apache.cloudstack.kvm.ha.KVMHostActivityChecker;
import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao;
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
import javax.inject.Inject;
-import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
public class KVMInvestigator extends AdapterBase implements Investigator {
@@ -54,13 +52,15 @@ public class KVMInvestigator extends AdapterBase implements
Investigator {
private HAManager haManager;
@Inject
private DataStoreProviderManager dataStoreProviderMgr;
+ @Inject
+ private KVMHostActivityChecker hostActivityChecker;
@Override
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws
UnknownVM {
if (haManager.isHAEligible(host)) {
return haManager.isVMAliveOnHost(host);
}
- Status status = isAgentAlive(host);
+ Status status = getHostAgentStatus(host);
logger.debug("HA: HOST is ineligible legacy state {} for host {}",
status, host);
if (status == null) {
throw new UnknownVM();
@@ -73,86 +73,41 @@ public class KVMInvestigator extends AdapterBase implements
Investigator {
}
@Override
- public Status isAgentAlive(Host agent) {
- if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM &&
agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
+ public Status getHostAgentStatus(Host host) {
+ if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM &&
host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
return null;
}
- if (haManager.isHAEligible(agent)) {
- return haManager.getHostStatus(agent);
+ if (haManager.isHAEligible(host)) {
+ return haManager.getHostStatusFromHAConfig(host);
}
- List<StoragePoolVO> clusterPools =
_storagePoolDao.findPoolsInClusters(Arrays.asList(agent.getClusterId()), null);
- boolean storageSupportHA = storageSupportHa(clusterPools);
- if (!storageSupportHA) {
- List<StoragePoolVO> zonePools =
_storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(),
agent.getHypervisorType());
- storageSupportHA = storageSupportHa(zonePools);
+ List<StoragePoolVO> clusterPools =
_storagePoolDao.findPoolsInClusters(Collections.singletonList(host.getClusterId()),
null);
+ boolean storageSupportsHA = storageSupportsHA(clusterPools);
+ if (!storageSupportsHA) {
+ List<StoragePoolVO> zonePools =
_storagePoolDao.findZoneWideStoragePoolsByHypervisor(host.getDataCenterId(),
host.getHypervisorType());
+ storageSupportsHA = storageSupportsHA(zonePools);
}
- if (!storageSupportHA) {
- logger.warn("Agent investigation was requested on host {}, but
host does not support investigation because it has no NFS storage. Skipping
investigation.", agent);
+ if (!storageSupportsHA) {
+ logger.warn("Agent investigation was requested on host {}, but
host does not support investigation" +
+ " because it has no HA supported storage. Skipping
investigation.", host);
return null;
}
- Status hostStatus = null;
- Status neighbourStatus = null;
- boolean reportFailureIfOneStorageIsDown =
HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
- CheckOnHostCommand cmd = new CheckOnHostCommand(agent,
reportFailureIfOneStorageIsDown);
-
- try {
- Answer answer = _agentMgr.easySend(agent.getId(), cmd);
- if (answer != null) {
- hostStatus = answer.getResult() ? Status.Down : Status.Up;
- }
- } catch (Exception e) {
- logger.debug("Failed to send command to host: {}", agent);
- }
- if (hostStatus == null) {
- hostStatus = Status.Disconnected;
- }
-
- List<HostVO> neighbors =
_resourceMgr.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
- for (HostVO neighbor : neighbors) {
- if (neighbor.getId() == agent.getId()
- || (neighbor.getHypervisorType() !=
Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() !=
Hypervisor.HypervisorType.LXC)) {
- continue;
- }
- logger.debug("Investigating host:{} via neighbouring host:{}",
agent, neighbor);
- try {
- Answer answer = _agentMgr.easySend(neighbor.getId(), cmd);
- if (answer != null) {
- neighbourStatus = answer.getResult() ? Status.Down :
Status.Up;
- logger.debug("Neighbouring host:{} returned status:{} for
the investigated host:{}", neighbor, neighbourStatus, agent);
- if (neighbourStatus == Status.Up) {
- break;
- }
- }
- } catch (Exception e) {
- logger.debug("Failed to send command to host: {}", neighbor);
- }
- }
- if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected
|| hostStatus == Status.Down)) {
- hostStatus = Status.Disconnected;
- }
- if (neighbourStatus == Status.Down && (hostStatus ==
Status.Disconnected || hostStatus == Status.Down)) {
- hostStatus = Status.Down;
- }
- logger.debug("HA: HOST is ineligible legacy state {} for host {}",
hostStatus, agent);
- return hostStatus;
+ return hostActivityChecker.getHostAgentStatus(host);
}
- private boolean storageSupportHa(List<StoragePoolVO> pools) {
- boolean storageSupportHA = false;
+ private boolean storageSupportsHA(List<StoragePoolVO> pools) {
for (StoragePoolVO pool : pools) {
DataStoreProvider storeProvider =
dataStoreProviderMgr.getDataStoreProvider(pool.getStorageProviderName());
DataStoreDriver storeDriver = storeProvider.getDataStoreDriver();
if (storeDriver instanceof PrimaryDataStoreDriver) {
PrimaryDataStoreDriver primaryStoreDriver =
(PrimaryDataStoreDriver)storeDriver;
if (primaryStoreDriver.isStorageSupportHA(pool.getPoolType()))
{
- storageSupportHA = true;
- break;
+ return true;
}
}
}
- return storageSupportHA;
+ return false;
}
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java
index e9a7ac8951c..f30e2c77919 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java
@@ -35,10 +35,9 @@ import com.cloud.agent.properties.AgentPropertiesFileHandler;
public class KVMHABase {
protected Logger logger = LogManager.getLogger(getClass());
private long _timeout = 60000; /* 1 minutes */
- protected long _heartBeatUpdateTimeout =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
- protected long _heartBeatUpdateFreq =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
+ protected long _heartBeatUpdateFreqInMs =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
protected long _heartBeatUpdateMaxTries =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES);
- protected long _heartBeatUpdateRetrySleep =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
+ protected long _heartBeatUpdateRetrySleepInMs =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
public static enum PoolType {
PrimaryStorage, SecondaryStorage
@@ -138,7 +137,7 @@ public class KVMHABase {
/* Can't find the mount point? */
/* we need to mount it under poolName */
if (poolName != null) {
- Script mount = new Script("/bin/bash", 60000);
+ Script mount = new Script("/bin/bash", _timeout);
mount.add("-c");
mount.add("mount " + mountSource + " " + destPath);
String result = mount.execute();
@@ -154,7 +153,6 @@ public class KVMHABase {
}
protected String getMountPoint(HAStoragePool storagePool) {
-
StoragePool pool = null;
String poolName = null;
try {
@@ -171,7 +169,6 @@ public class KVMHABase {
}
poolName = pool.getName();
}
-
} catch (LibvirtException e) {
logger.debug("Ignoring libvirt error.", e);
} finally {
@@ -234,7 +231,7 @@ public class KVMHABase {
return result;
}
- public Boolean checkingHeartBeat() {
+ public Boolean hasHeartBeat() {
// TODO Auto-generated method stub
return null;
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java
index db6190fa8f2..0ee59c95da3 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java
@@ -26,44 +26,43 @@ import com.cloud.agent.api.to.HostTO;
public class KVMHAChecker extends KVMHABase implements Callable<Boolean> {
private List<HAStoragePool> storagePools;
private HostTO host;
- private boolean reportFailureIfOneStorageIsDown;
+ private boolean reportIfHeartBeatFailedForOneStoragePool;
- public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean
reportFailureIfOneStorageIsDown) {
+ public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean
reportIfHeartBeatFailedForOneStoragePool) {
this.storagePools = pools;
this.host = host;
- this.reportFailureIfOneStorageIsDown = reportFailureIfOneStorageIsDown;
+ this.reportIfHeartBeatFailedForOneStoragePool =
reportIfHeartBeatFailedForOneStoragePool;
}
/*
- * True means heartbeaing is on going, or we can't get it's status. False
- * means heartbeating is stopped definitely
+ * True means heart beating is on going, or we can't get it's status.
+ * False means heart beating is stopped definitely.
*/
@Override
- public Boolean checkingHeartBeat() {
- boolean validResult = false;
-
- String hostAndPools = String.format("host IP [%s] in pools [%s]",
host.getPrivateNetwork().getIp(), storagePools.stream().map(pool ->
pool.getPoolUUID()).collect(Collectors.joining(", ")));
-
- logger.debug(String.format("Checking heart beat with KVMHAChecker for
%s", hostAndPools));
+ public Boolean hasHeartBeat() {
+ String hostAndPools = String.format("host IP [%s] in pools [%s]",
host.getPrivateNetwork().getIp(),
+ storagePools.stream().map(pool ->
pool.getPoolUUID()).collect(Collectors.joining(", ")));
+ logger.debug("Checking heart beat with KVMHAChecker for {}",
hostAndPools);
+ boolean heartBeatCheckResult = false;
for (HAStoragePool pool : storagePools) {
- validResult = pool.getPool().checkingHeartBeat(pool, host);
- if (reportFailureIfOneStorageIsDown && !validResult) {
+ heartBeatCheckResult = pool.getPool().hasHeartBeat(pool, host);
+ if (reportIfHeartBeatFailedForOneStoragePool &&
!heartBeatCheckResult) {
break;
}
}
- if (!validResult) {
- logger.warn(String.format("All checks with KVMHAChecker for %s
considered it as dead. It may cause a shutdown of the host.", hostAndPools));
+ if (!heartBeatCheckResult) {
+ logger.warn("All checks with KVMHAChecker for {} considered it as
dead. It may cause a shutdown of the host.", hostAndPools);
}
- return validResult;
+ return heartBeatCheckResult;
}
@Override
public Boolean call() throws Exception {
// logger.addAppender(new org.apache.log4j.ConsoleAppender(new
// org.apache.log4j.PatternLayout(), "System.out"));
- return checkingHeartBeat();
+ return hasHeartBeat();
}
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java
index aa868ff1d3f..9f1b849e972 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java
@@ -34,53 +34,49 @@ import java.util.concurrent.ConcurrentHashMap;
public class KVMHAMonitor extends KVMHABase implements Runnable {
- private final Map<String, HAStoragePool> storagePool = new
ConcurrentHashMap<>();
+ private final Map<String, HAStoragePool> haStoragePools = new
ConcurrentHashMap<>();
private final boolean rebootHostAndAlertManagementOnHeartbeatTimeout;
private final String hostPrivateIp;
- public KVMHAMonitor(HAStoragePool pool, String host) {
- if (pool != null) {
- storagePool.put(pool.getPoolUUID(), pool);
- }
+ public KVMHAMonitor(String host) {
hostPrivateIp = host;
-
rebootHostAndAlertManagementOnHeartbeatTimeout =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT);
}
public void addStoragePool(HAStoragePool pool) {
- synchronized (storagePool) {
- storagePool.put(pool.getPoolUUID(), pool);
+ synchronized (haStoragePools) {
+ haStoragePools.put(pool.getPoolUUID(), pool);
}
}
public void removeStoragePool(String uuid) {
- synchronized (storagePool) {
- HAStoragePool pool = storagePool.get(uuid);
+ synchronized (haStoragePools) {
+ HAStoragePool pool = haStoragePools.get(uuid);
if (pool != null) {
Script.runSimpleBashScript("umount " +
pool.getMountDestPath());
- storagePool.remove(uuid);
+ haStoragePools.remove(uuid);
}
}
}
public List<HAStoragePool> getStoragePools() {
- synchronized (storagePool) {
- return new ArrayList<>(storagePool.values());
+ synchronized (haStoragePools) {
+ return new ArrayList<>(haStoragePools.values());
}
}
public HAStoragePool getStoragePool(String uuid) {
- synchronized (storagePool) {
- return storagePool.get(uuid);
+ synchronized (haStoragePools) {
+ return haStoragePools.get(uuid);
}
}
protected void runHeartBeat() {
- synchronized (storagePool) {
+ synchronized (haStoragePools) {
Set<String> removedPools = new HashSet<>();
- for (String uuid : storagePool.keySet()) {
- HAStoragePool primaryStoragePool = storagePool.get(uuid);
+ for (String uuid : haStoragePools.keySet()) {
+ HAStoragePool primaryStoragePool = haStoragePools.get(uuid);
if
(HighAvailabilityManager.LIBVIRT_STORAGE_POOL_TYPES_WITH_HA_SUPPORT.contains(primaryStoragePool.getPool().getType()))
{
checkForNotExistingLibvirtStoragePools(removedPools, uuid);
if (removedPools.contains(uuid)) {
@@ -91,7 +87,7 @@ public class KVMHAMonitor extends KVMHABase implements
Runnable {
result = executePoolHeartBeatCommand(uuid, primaryStoragePool,
result);
if (result != null &&
rebootHostAndAlertManagementOnHeartbeatTimeout) {
- logger.warn(String.format("Write heartbeat for pool [%s]
failed: %s; stopping cloudstack-agent.", uuid, result));
+ logger.warn("Write heartbeat for pool [{}] failed: {};
stopping cloudstack-agent.", uuid, result);
primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, null,
false);;
}
}
@@ -104,20 +100,18 @@ public class KVMHAMonitor extends KVMHABase implements
Runnable {
}
private String executePoolHeartBeatCommand(String uuid, HAStoragePool
primaryStoragePool, String result) {
- for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) {
+ for (int attempt = 1; attempt <= _heartBeatUpdateMaxTries; attempt++) {
result =
primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool,
hostPrivateIp, true);
-
- if (result != null) {
- logger.warn(String.format("Write heartbeat for pool [%s]
failed: %s; try: %s of %s.", uuid, result, i, _heartBeatUpdateMaxTries));
- try {
- Thread.sleep(_heartBeatUpdateRetrySleep);
- } catch (InterruptedException e) {
- logger.debug("[IGNORED] Interrupted between heartbeat
retries.", e);
- }
- } else {
+ if (result == null) {
break;
}
+ logger.warn("Write heartbeat for pool [{}] failed: {}; try: {} of
{}.", uuid, result, attempt, _heartBeatUpdateMaxTries);
+ try {
+ Thread.sleep(_heartBeatUpdateRetrySleepInMs);
+ } catch (InterruptedException e) {
+ logger.debug("[IGNORED] Interrupted between heartbeat
retries.", e);
+ }
}
return result;
}
@@ -128,21 +122,21 @@ public class KVMHAMonitor extends KVMHABase implements
Runnable {
StoragePool storage = conn.storagePoolLookupByUUIDString(uuid);
if (storage == null || storage.getInfo().state !=
StoragePoolState.VIR_STORAGE_POOL_RUNNING) {
if (storage == null) {
- logger.debug(String.format("Libvirt storage pool [%s] not
found, removing from HA list.", uuid));
+ logger.debug("Libvirt storage pool [{}] not found,
removing from HA list.", uuid);
} else {
- logger.debug(String.format("Libvirt storage pool [%s]
found, but not running, removing from HA list.", uuid));
+ logger.debug("Libvirt storage pool [{}] found, but not
running, removing from HA list.", uuid);
}
removedPools.add(uuid);
}
- logger.debug(String.format("Found NFS storage pool [%s] in
libvirt, continuing.", uuid));
+ logger.debug("Found NFS storage pool [{}] in libvirt,
continuing.", uuid);
} catch (LibvirtException e) {
- logger.debug(String.format("Failed to lookup libvirt storage pool
[%s].", uuid), e);
+ logger.debug("Failed to lookup libvirt storage pool [{}].", uuid,
e);
if (e.toString().contains("pool not found")) {
- logger.debug(String.format("Removing pool [%s] from HA monitor
since it was deleted.", uuid));
+ logger.debug("Removing pool [{}] from HA monitor since it was
deleted.", uuid);
removedPools.add(uuid);
}
}
@@ -155,11 +149,10 @@ public class KVMHAMonitor extends KVMHABase implements
Runnable {
runHeartBeat();
try {
- Thread.sleep(_heartBeatUpdateFreq);
+ Thread.sleep(_heartBeatUpdateFreqInMs);
} catch (InterruptedException e) {
logger.debug("[IGNORED] Interrupted between heartbeats.", e);
}
}
}
-
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAVMActivityChecker.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAVMActivityChecker.java
index e6937b515e9..c13be64a3a3 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAVMActivityChecker.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAVMActivityChecker.java
@@ -39,12 +39,12 @@ public class KVMHAVMActivityChecker extends KVMHABase
implements Callable<Boolea
}
@Override
- public Boolean checkingHeartBeat() {
- return this.storagePool.getPool().vmActivityCheck(storagePool, host,
activityScriptTimeout, volumeUuidList, vmActivityCheckPath,
suspectTimeInSeconds);
+ public Boolean hasHeartBeat() {
+ return this.storagePool.getPool().hasVmActivity(storagePool, host,
activityScriptTimeout, volumeUuidList, vmActivityCheckPath,
suspectTimeInSeconds);
}
@Override
public Boolean call() throws Exception {
- return checkingHeartBeat();
+ return hasHeartBeat();
}
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java
index 25162ca9b92..80d9a51cb85 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java
@@ -1386,9 +1386,9 @@ public class LibvirtComputingResource extends
ServerResourceBase implements Serv
final String[] info = NetUtils.getNetworkParams(privateNic);
- kvmhaMonitor = new KVMHAMonitor(null, info[0]);
- final Thread ha = new Thread(kvmhaMonitor);
- ha.start();
+ kvmhaMonitor = new KVMHAMonitor(info[0]);
+ final Thread haMonitorThread = new Thread(kvmhaMonitor);
+ haMonitorThread.start();
storagePoolManager = new KVMStoragePoolManager(storageLayer,
kvmhaMonitor);
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java
index 48996a7ba97..f901fd97ca7 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java
@@ -26,6 +26,7 @@ import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import com.cloud.agent.api.Answer;
+import com.cloud.agent.api.CheckOnHostAnswer;
import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.agent.api.to.HostTO;
import com.cloud.hypervisor.kvm.resource.KVMHABase.HAStoragePool;
@@ -45,20 +46,21 @@ public final class LibvirtCheckOnHostCommandWrapper extends
CommandWrapper<Check
final List<HAStoragePool> pools = monitor.getStoragePools();
final HostTO host = command.getHost();
- final KVMHAChecker ha = new KVMHAChecker(pools, host,
command.isCheckFailedOnOneStorage());
+
+ final KVMHAChecker ha = new KVMHAChecker(pools, host,
command.shouldReportIfHeartBeatFailedForOneStoragePool());
final Future<Boolean> future = executors.submit(ha);
try {
- final Boolean result = future.get();
- if (result) {
- return new Answer(command, false, "Heart is beating...");
+ final Boolean hasHeartBeat = future.get();
+ if (hasHeartBeat) {
+ return new CheckOnHostAnswer(command, true, "Heart is
beating");
} else {
- return new Answer(command);
+ return new CheckOnHostAnswer(command, "Heart is not beating");
}
} catch (final InterruptedException e) {
- return new Answer(command, false, "CheckOnHostCommand: can't get
status of host: InterruptedException");
+ return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't
get status of host: InterruptedException");
} catch (final ExecutionException e) {
- return new Answer(command, false, "CheckOnHostCommand: can't get
status of host: ExecutionException");
+ return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't
get status of host: ExecutionException");
}
}
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckVMActivityOnStoragePoolCommandWrapper.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckVMActivityOnStoragePoolCommandWrapper.java
index d3f537dc917..b132f6d98ce 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckVMActivityOnStoragePoolCommandWrapper.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckVMActivityOnStoragePoolCommandWrapper.java
@@ -49,8 +49,8 @@ public final class
LibvirtCheckVMActivityOnStoragePoolCommandWrapper extends Com
KVMStoragePool primaryPool =
storagePoolMgr.getStoragePool(pool.getType(), pool.getUuid());
if (primaryPool.isPoolSupportHA()) {
- final HAStoragePool nfspool =
monitor.getStoragePool(pool.getUuid());
- final KVMHAVMActivityChecker ha = new
KVMHAVMActivityChecker(nfspool, command.getHost(), command.getVolumeList(),
libvirtComputingResource.getVmActivityCheckPath(),
command.getSuspectTimeInSeconds());
+ final HAStoragePool haPool =
monitor.getStoragePool(pool.getUuid());
+ final KVMHAVMActivityChecker ha = new
KVMHAVMActivityChecker(haPool, command.getHost(), command.getVolumeList(),
libvirtComputingResource.getVmActivityCheckPath(),
command.getSuspectTimeInSeconds());
final Future<Boolean> future = executors.submit(ha);
try {
final Boolean result = future.get();
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/IscsiAdmStoragePool.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/IscsiAdmStoragePool.java
index f5bfd898a4f..8cf6de68f95 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/IscsiAdmStoragePool.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/IscsiAdmStoragePool.java
@@ -208,12 +208,12 @@ public class IscsiAdmStoragePool implements
KVMStoragePool {
}
@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
return null;
}
@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration) {
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration) {
return null;
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/KVMStoragePool.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/KVMStoragePool.java
index 8dd2116e123..3e35ed9476b 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/KVMStoragePool.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/KVMStoragePool.java
@@ -33,35 +33,33 @@ import com.cloud.storage.Storage.StoragePoolType;
public interface KVMStoragePool {
- public static final long HeartBeatUpdateTimeout =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
- public static final long HeartBeatUpdateFreq =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
- public static final long HeartBeatUpdateMaxTries =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES);
- public static final long HeartBeatUpdateRetrySleep =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
- public static final long HeartBeatCheckerTimeout =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_CHECKER_TIMEOUT);
+ long HeartBeatUpdateTimeoutInMs =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
+ long HeartBeatUpdateFreqInMs =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
+ long HeartBeatCheckerTimeoutInMs =
AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_CHECKER_TIMEOUT);
- public default KVMPhysicalDisk createPhysicalDisk(String volumeUuid,
PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long
size, Long usableSize, byte[] passphrase) {
+ default KVMPhysicalDisk createPhysicalDisk(String volumeUuid,
PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long
size, Long usableSize, byte[] passphrase) {
return createPhysicalDisk(volumeUuid, format, provisioningType, size,
passphrase);
}
- public KVMPhysicalDisk createPhysicalDisk(String volumeUuid,
PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long
size, byte[] passphrase);
+ KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat
format, Storage.ProvisioningType provisioningType, long size, byte[]
passphrase);
- public KVMPhysicalDisk createPhysicalDisk(String volumeUuid,
Storage.ProvisioningType provisioningType, long size, byte[] passphrase);
+ KVMPhysicalDisk createPhysicalDisk(String volumeUuid,
Storage.ProvisioningType provisioningType, long size, byte[] passphrase);
- public boolean connectPhysicalDisk(String volumeUuid, Map<String, String>
details);
+ boolean connectPhysicalDisk(String volumeUuid, Map<String, String>
details);
- public KVMPhysicalDisk getPhysicalDisk(String volumeUuid);
+ KVMPhysicalDisk getPhysicalDisk(String volumeUuid);
- public boolean disconnectPhysicalDisk(String volumeUuid);
+ boolean disconnectPhysicalDisk(String volumeUuid);
- public boolean deletePhysicalDisk(String volumeUuid, Storage.ImageFormat
format);
+ boolean deletePhysicalDisk(String volumeUuid, Storage.ImageFormat format);
- public List<KVMPhysicalDisk> listPhysicalDisks();
+ List<KVMPhysicalDisk> listPhysicalDisks();
- public String getUuid();
+ String getUuid();
- public long getCapacity();
+ long getCapacity();
- public long getUsed();
+ long getUsed();
default Long getCapacityIops() {
return null;
@@ -71,51 +69,51 @@ public interface KVMStoragePool {
return null;
}
- public long getAvailable();
+ long getAvailable();
- public boolean refresh();
+ boolean refresh();
- public boolean isExternalSnapshot();
+ boolean isExternalSnapshot();
- public String getLocalPath();
+ String getLocalPath();
- public String getSourceHost();
+ String getSourceHost();
- public String getSourceDir();
+ String getSourceDir();
- public int getSourcePort();
+ int getSourcePort();
- public String getAuthUserName();
+ String getAuthUserName();
- public String getAuthSecret();
+ String getAuthSecret();
- public StoragePoolType getType();
+ StoragePoolType getType();
- public boolean delete();
+ boolean delete();
PhysicalDiskFormat getDefaultFormat();
- public boolean createFolder(String path);
+ boolean createFolder(String path);
- public boolean supportsConfigDriveIso();
+ boolean supportsConfigDriveIso();
- public Map<String, String> getDetails();
+ Map<String, String> getDetails();
default String getLocalPathFor(String relativePath) {
return String.format("%s%s%s", getLocalPath(), File.separator,
relativePath);
}
- public boolean isPoolSupportHA();
+ boolean isPoolSupportHA();
- public String getHearthBeatPath();
+ String getHearthBeatPath();
- public String createHeartBeatCommand(HAStoragePool primaryStoragePool,
String hostPrivateIp, boolean hostValidation);
+ String createHeartBeatCommand(HAStoragePool primaryStoragePool, String
hostPrivateIp, boolean hostValidation);
- public String getStorageNodeId();
+ String getStorageNodeId();
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host);
+ Boolean hasHeartBeat(HAStoragePool pool, HostTO host);
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration);
+ Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration);
default LibvirtVMDef.DiskDef.BlockIOSize getSupportedLogicalBlockSize() {
return null;
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/LibvirtStoragePool.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/LibvirtStoragePool.java
index 45c22d3ac75..910f0eb15e0 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/LibvirtStoragePool.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/LibvirtStoragePool.java
@@ -345,16 +345,14 @@ public class LibvirtStoragePool implements KVMStoragePool
{
public String createHeartBeatCommand(HAStoragePool primaryStoragePool,
String hostPrivateIp, boolean hostValidation) {
- Script cmd = new
Script(primaryStoragePool.getPool().getHearthBeatPath(),
HeartBeatUpdateTimeout, logger);
+ Script cmd = new
Script(primaryStoragePool.getPool().getHearthBeatPath(),
HeartBeatUpdateTimeoutInMs, logger);
cmd.add("-i", primaryStoragePool.getPoolIp());
cmd.add("-p", primaryStoragePool.getPoolMountSourcePath());
cmd.add("-m", primaryStoragePool.getMountDestPath());
if (hostValidation) {
cmd.add("-h", hostPrivateIp);
- }
-
- if (!hostValidation) {
+ } else {
cmd.add("-c");
}
@@ -372,53 +370,53 @@ public class LibvirtStoragePool implements KVMStoragePool
{
}
@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
- boolean validResult = false;
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
String hostIp = host.getPrivateNetwork().getIp();
- Script cmd = new Script(getHearthBeatPath(), HeartBeatCheckerTimeout,
logger);
+ Script cmd = new Script(getHearthBeatPath(),
HeartBeatCheckerTimeoutInMs, logger);
cmd.add("-i", pool.getPoolIp());
cmd.add("-p", pool.getPoolMountSourcePath());
cmd.add("-m", pool.getMountDestPath());
cmd.add("-h", hostIp);
cmd.add("-r");
- cmd.add("-t", String.valueOf(HeartBeatUpdateFreq / 1000));
+ cmd.add("-t", String.valueOf(HeartBeatUpdateFreqInMs / 1000));
OutputInterpreter.OneLineParser parser = new
OutputInterpreter.OneLineParser();
String result = cmd.execute(parser);
String parsedLine = parser.getLine();
- logger.debug(String.format("Checking heart beat with KVMHAChecker
[{command=\"%s\", result: \"%s\", log: \"%s\", pool: \"%s\"}].",
cmd.toString(), result, parsedLine,
- pool.getPoolIp()));
+ logger.debug("Checking heart beat for host IP {} with KVMHAChecker
[{command=\"{}\", result: \"{}\", log: \"{}\", pool: \"{}\"}].", hostIp,
cmd.toString(), result, parsedLine, pool.getPoolIp());
if (result == null && parsedLine.contains("DEAD")) {
- logger.warn(String.format("Checking heart beat with KVMHAChecker
command [%s] returned [%s]. [%s]. It may cause a shutdown of host IP [%s].",
cmd.toString(),
- result, parsedLine, hostIp));
+ logger.warn("Checking heart beat for host IP {} with KVMHAChecker
command [{}] returned [{}]. It may cause a shutdown of the host.", hostIp,
cmd.toString(), parsedLine);
+ return false;
} else {
- validResult = true;
+ logger.debug("Checking heart beat for host IP {} with KVMHAChecker
command [{}] succeeded.", hostIp, cmd.toString());
+ return true;
}
- return validResult;
}
@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration) {
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration) {
+ String hostIp = host.getPrivateNetwork().getIp();
Script cmd = new Script(vmActivityCheckPath,
activityScriptTimeout.getStandardSeconds(), logger);
cmd.add("-i", pool.getPoolIp());
cmd.add("-p", pool.getPoolMountSourcePath());
cmd.add("-m", pool.getMountDestPath());
- cmd.add("-h", host.getPrivateNetwork().getIp());
+ cmd.add("-h", hostIp);
cmd.add("-u", volumeUUIDListString);
- cmd.add("-t", String.valueOf(String.valueOf(System.currentTimeMillis()
/ 1000)));
+ cmd.add("-t", String.valueOf(System.currentTimeMillis() / 1000));
cmd.add("-d", String.valueOf(duration));
OutputInterpreter.OneLineParser parser = new
OutputInterpreter.OneLineParser();
String result = cmd.execute(parser);
String parsedLine = parser.getLine();
- logger.debug(String.format("Checking heart beat with
KVMHAVMActivityChecker [{command=\"%s\", result: \"%s\", log: \"%s\", pool:
\"%s\"}].", cmd.toString(), result, parsedLine, pool.getPoolIp()));
+ logger.debug("Checking VM activity for host IP {} with
KVMHAVMActivityChecker [{command=\"{}\", result: \"{}\", log: \"{}\", pool:
\"{}\"}].", hostIp, cmd.toString(), result, parsedLine, pool.getPoolIp());
if (result == null && parsedLine.contains("DEAD")) {
- logger.warn(String.format("Checking heart beat with
KVMHAVMActivityChecker command [%s] returned [%s]. It is [%s]. It may cause a
shutdown of host IP [%s].", cmd.toString(), result, parsedLine,
host.getPrivateNetwork().getIp()));
+ logger.warn("Checking VM activity for host IP {} with
KVMHAVMActivityChecker command [{}] returned [{}]. It may cause a shutdown of
the host.", hostIp, cmd.toString(), parsedLine);
return false;
} else {
+ logger.debug("Checking VM activity for host IP {} with
KVMHAVMActivityChecker command [{}] succeeded.", hostIp, cmd.toString());
return true;
}
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/MultipathSCSIPool.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/MultipathSCSIPool.java
index 229481b1f79..df9986c9919 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/MultipathSCSIPool.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/MultipathSCSIPool.java
@@ -225,13 +225,13 @@ public class MultipathSCSIPool implements KVMStoragePool {
}
@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
return null;
}
@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout,
- String volumeUUIDListString, String vmActivityCheckPath, long
duration) {
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout,
+ String volumeUUIDListString, String
vmActivityCheckPath, long duration) {
return null;
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/ScaleIOStoragePool.java
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/ScaleIOStoragePool.java
index e8243c3f7cf..fc512ff94a9 100644
---
a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/ScaleIOStoragePool.java
+++
b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/ScaleIOStoragePool.java
@@ -236,12 +236,12 @@ public class ScaleIOStoragePool implements KVMStoragePool
{
}
@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
return null;
}
@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration) {
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration) {
return null;
}
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java
b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java
index 59ea720328f..3fbb5340fcc 100644
---
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java
+++
b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java
@@ -19,38 +19,37 @@ package org.apache.cloudstack.kvm.ha;
import org.apache.cloudstack.framework.config.ConfigKey;
-public class KVMHAConfig {
+public interface KVMHAConfig {
- public static final ConfigKey<Long> KvmHAHealthCheckTimeout = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.health.check.timeout", "10",
+ ConfigKey<Long> KvmHAHealthCheckTimeout = new ConfigKey<>("Advanced",
Long.class, "kvm.ha.health.check.timeout", "10",
"The maximum length of time, in seconds, expected for an health
check to complete.", true, ConfigKey.Scope.Cluster);
- public static final ConfigKey<Long> KvmHAActivityCheckTimeout = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.timeout", "60",
+ ConfigKey<Long> KvmHAActivityCheckTimeout = new ConfigKey<>("Advanced",
Long.class, "kvm.ha.activity.check.timeout", "60",
"The maximum length of time, in seconds, expected for an activity
check to complete.", true, ConfigKey.Scope.Cluster);
- public static final ConfigKey<Long> KvmHAActivityCheckInterval = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.interval", "60",
+ ConfigKey<Long> KvmHAActivityCheckInterval = new ConfigKey<>("Advanced",
Long.class, "kvm.ha.activity.check.interval", "60",
"The interval, in seconds, between activity checks.", true,
ConfigKey.Scope.Cluster);
- public static final ConfigKey<Long> KvmHAActivityCheckMaxAttempts = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.max.attempts", "10",
+ ConfigKey<Long> KvmHAActivityCheckMaxAttempts = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.max.attempts", "10",
"The maximum number of activity check attempts to perform before
deciding to recover or degrade a resource.", true, ConfigKey.Scope.Cluster);
- public static final ConfigKey<Double> KvmHAActivityCheckFailureThreshold =
new ConfigKey<>("Advanced", Double.class,
"kvm.ha.activity.check.failure.ratio", "0.7",
+ ConfigKey<Double> KvmHAActivityCheckFailureThreshold = new
ConfigKey<>("Advanced", Double.class, "kvm.ha.activity.check.failure.ratio",
"0.7",
"The activity check failure threshold ratio. This is used with the
activity check maximum attempts for deciding to recover or degrade a resource.
For most environments, please keep this value above 0.5.",
true, ConfigKey.Scope.Cluster);
- public static final ConfigKey<Long> KvmHADegradedMaxPeriod = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.degraded.max.period", "300",
+ ConfigKey<Long> KvmHADegradedMaxPeriod = new ConfigKey<>("Advanced",
Long.class, "kvm.ha.degraded.max.period", "300",
"The maximum length of time, in seconds, a resource can be in
degraded state where only health checks are performed.", true,
ConfigKey.Scope.Cluster);
- public static final ConfigKey<Long> KvmHARecoverTimeout = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.timeout", "60",
+ ConfigKey<Long> KvmHARecoverTimeout = new ConfigKey<>("Advanced",
Long.class, "kvm.ha.recover.timeout", "60",
"The maximum length of time, in seconds, expected for a recovery
operation to complete.", true, ConfigKey.Scope.Cluster);
- public static final ConfigKey<Long> KvmHARecoverWaitPeriod = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.wait.period", "600",
+ ConfigKey<Long> KvmHARecoverWaitPeriod = new ConfigKey<>("Advanced",
Long.class, "kvm.ha.recover.wait.period", "600",
"The maximum length of time, in seconds, to wait for a resource to
recover.", true, ConfigKey.Scope.Cluster);
- public static final ConfigKey<Long> KvmHARecoverAttemptThreshold = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.failure.threshold", "1",
+ ConfigKey<Long> KvmHARecoverAttemptThreshold = new ConfigKey<>("Advanced",
Long.class, "kvm.ha.recover.failure.threshold", "1",
"The maximum recovery attempts to be made for a resource, after
which the resource is fenced. The recovery counter resets when a health check
passes for a resource.",
true, ConfigKey.Scope.Cluster);
- public static final ConfigKey<Long> KvmHAFenceTimeout = new
ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60",
+ ConfigKey<Long> KvmHAFenceTimeout = new ConfigKey<>("Advanced",
Long.class, "kvm.ha.fence.timeout", "60",
"The maximum length of time, in seconds, expected for a fence
operation to complete.", true, ConfigKey.Scope.Cluster);
-
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java
b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java
index b937be5265b..f0b5cfc337d 100644
---
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java
+++
b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAProvider.java
@@ -68,17 +68,18 @@ public final class KVMHAProvider extends
HAAbstractHostProvider implements HAPro
@Override
public boolean recover(Host r) throws HARecoveryException {
+ logger.debug("Recover the host {}", r);
try {
- if (outOfBandManagementService.isOutOfBandManagementEnabled(r)){
+ if (outOfBandManagementService.isOutOfBandManagementEnabled(r)) {
final OutOfBandManagementResponse resp =
outOfBandManagementService.executePowerOperation(r, PowerOperation.RESET, null);
return resp.getSuccess();
} else {
logger.warn("OOBM recover operation failed for the host {}",
r);
return false;
}
- } catch (Exception e){
+ } catch (Exception e) {
logger.warn("OOBM service is not configured or enabled for this
host {} error is {}", r, e.getMessage());
- throw new HARecoveryException(String.format(" OOBM service is not
configured or enabled for this host %s", r), e);
+ throw new HARecoveryException(String.format("OOBM service is not
configured or enabled for this host %s", r), e);
}
}
diff --git
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java
b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java
index 31f87d7e044..af7441c4fd2 100644
---
a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java
+++
b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java
@@ -19,6 +19,7 @@ package org.apache.cloudstack.kvm.ha;
import com.cloud.agent.AgentManager;
import com.cloud.agent.api.Answer;
+import com.cloud.agent.api.CheckOnHostAnswer;
import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.agent.api.CheckVMActivityOnStoragePoolCommand;
import com.cloud.dc.dao.ClusterDao;
@@ -61,7 +62,7 @@ public class KVMHostActivityChecker extends AdapterBase
implements ActivityCheck
@Inject
private AgentManager agentMgr;
@Inject
- private PrimaryDataStoreDao storagePool;
+ private PrimaryDataStoreDao storagePoolDao;
@Inject
private StorageManager storageManager;
@Inject
@@ -70,11 +71,11 @@ public class KVMHostActivityChecker extends AdapterBase
implements ActivityCheck
@Override
public boolean isActive(Host r, DateTime suspectTime) throws
HACheckerException {
try {
- return isVMActivityOnHost(r, suspectTime);
+ return hasVMActivityOnHost(r, suspectTime);
} catch (HACheckerException e) {
- //Re-throwing the exception to avoid poluting the
'HACheckerException' already thrown
+ //Re-throwing the exception to avoid polluting the
'HACheckerException' already thrown
throw e;
- } catch (Exception e){
+ } catch (Exception e) {
String message = String.format("Operation timed out, probably the
%s is not reachable.", r.toString());
logger.warn(message, e);
throw new HACheckerException(message, e);
@@ -83,82 +84,115 @@ public class KVMHostActivityChecker extends AdapterBase
implements ActivityCheck
@Override
public boolean isHealthy(Host r) {
- return isAgentActive(r);
+ return isHostAgentUp(r);
}
- private boolean isAgentActive(Host agent) {
- if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM &&
agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
- throw new IllegalStateException(String.format("Calling KVM
investigator for non KVM Host of type [%s].", agent.getHypervisorType()));
+ private boolean isHostAgentUp(Host host) {
+ if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM &&
host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
+ throw new IllegalStateException(String.format("Calling KVM
investigator for non KVM Host of type [%s].", host.getHypervisorType()));
+ }
+
+ Status hostStatus = getHostAgentStatus(host);
+
+ logger.debug("{} has the status [{}].", host.toString(), hostStatus);
+ return hostStatus == Status.Up;
+ }
+
+ public Status getHostAgentStatus(Host host) {
+ if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM &&
host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
+ return null;
+ }
+
+ Status hostStatusFromItself = checkHostStatusWithSameHost(host);
+ if (hostStatusFromItself == Status.Up) {
+ return Status.Up;
}
- Status hostStatus = Status.Unknown;
- Status neighbourStatus = Status.Unknown;
- final CheckOnHostCommand cmd = new CheckOnHostCommand(agent,
HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value());
+
+ Status hostStatusFromNeighbour =
checkHostStatusWithNeighbourHosts(host);
+ Status hostStatus = hostStatusFromItself;
+ if (hostStatusFromNeighbour == Status.Up && (hostStatusFromItself ==
Status.Disconnected || hostStatusFromItself == Status.Down)) {
+ hostStatus = Status.Disconnected;
+ }
+ if (hostStatusFromNeighbour == Status.Down && (hostStatusFromItself ==
Status.Disconnected || hostStatusFromItself == Status.Down)) {
+ hostStatus = Status.Down;
+ }
+
+ logger.debug("HA: HOST is ineligible legacy state {} for host {}",
hostStatus, host);
+ return hostStatus;
+ }
+
+ private Status checkHostStatusWithSameHost(Host host) {
+ Status hostStatus;
+ boolean reportFailureIfOneStorageIsDown =
HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
+ final CheckOnHostCommand cmd = new CheckOnHostCommand(host,
reportFailureIfOneStorageIsDown);
try {
- logger.debug(String.format("Checking %s status...",
agent.toString()));
- Answer answer = agentMgr.easySend(agent.getId(), cmd);
+ logger.debug("Checking {} status...", host.toString());
+ Answer answer = agentMgr.easySend(host.getId(), cmd);
if (answer != null) {
- hostStatus = answer.getResult() ? Status.Down : Status.Up;
- logger.debug(String.format("%s has the status [%s].",
agent.toString(), hostStatus));
-
- if ( hostStatus == Status.Up ){
- return true;
+ if (answer.getResult()) {
+ hostStatus = ((CheckOnHostAnswer)answer).isAlive() ?
Status.Up : Status.Down;
+ } else {
+ logger.debug("{} is not active according to itself,
details: {}.", host.toString(), answer.getDetails());
+ hostStatus = Status.Down;
}
- }
- else {
- logger.debug(String.format("Setting %s to \"Disconnected\"
status.", agent.toString()));
+ logger.debug("{} has the status [{}].", host.toString(),
hostStatus);
+ } else {
+ logger.debug("Setting {} to \"Disconnected\" status.",
host.toString());
hostStatus = Status.Disconnected;
}
} catch (Exception e) {
- logger.warn(String.format("Failed to send command
CheckOnHostCommand to %s.", agent.toString()), e);
+ logger.warn("Failed to send command CheckOnHostCommand to {}.",
host.toString(), e);
+ hostStatus = Status.Disconnected;
}
- List<HostVO> neighbors =
resourceManager.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
+ return hostStatus;
+ }
+
+ private Status checkHostStatusWithNeighbourHosts(Host host) {
+ Status hostStatusFromNeighbour = Status.Unknown;
+ boolean reportFailureIfOneStorageIsDown =
HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
+ final CheckOnHostCommand cmd = new CheckOnHostCommand(host,
reportFailureIfOneStorageIsDown);
+ List<HostVO> neighbors =
resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up);
for (HostVO neighbor : neighbors) {
- if (neighbor.getId() == agent.getId() ||
(neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM &&
neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
+ if (neighbor.getId() == host.getId()
+ || (neighbor.getHypervisorType() !=
Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() !=
Hypervisor.HypervisorType.LXC)) {
continue;
}
try {
- logger.debug(String.format("Investigating %s via neighbouring
%s.", agent.toString(), neighbor.toString()));
-
+ logger.debug("Investigating {} via neighboring {}.",
host.toString(), neighbor.toString());
Answer answer = agentMgr.easySend(neighbor.getId(), cmd);
if (answer != null) {
- neighbourStatus = answer.getResult() ? Status.Down :
Status.Up;
-
- logger.debug(String.format("Neighbouring %s returned
status [%s] for the investigated %s.", neighbor.toString(), neighbourStatus,
agent.toString()));
-
- if (neighbourStatus == Status.Up) {
- break;
+ if (answer.getResult()) {
+ hostStatusFromNeighbour =
((CheckOnHostAnswer)answer).isAlive() ? Status.Up : Status.Down;
+ logger.debug("Neighboring {} returned status [{}] for
the investigated {}.", neighbor.toString(), hostStatusFromNeighbour,
host.toString());
+ if (hostStatusFromNeighbour == Status.Up) {
+ return hostStatusFromNeighbour;
+ }
+ } else {
+ logger.debug("{} is not active according to neighbor
{}, details: {}.", host.toString(), neighbor.toString(), answer.getDetails());
}
} else {
- logger.debug(String.format("Neighbouring %s is
Disconnected.", neighbor.toString()));
+ logger.debug("Neighboring {} is Disconnected.",
neighbor.toString());
}
} catch (Exception e) {
- logger.warn(String.format("Failed to send command
CheckOnHostCommand to %s.", neighbor.toString()), e);
+ logger.warn("Failed to send command CheckOnHostCommand to
neighbor {}.", neighbor.toString(), e);
}
}
- if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected
|| hostStatus == Status.Down)) {
- hostStatus = Status.Disconnected;
- }
- if (neighbourStatus == Status.Down && (hostStatus ==
Status.Disconnected || hostStatus == Status.Down)) {
- hostStatus = Status.Down;
- }
- logger.debug(String.format("%s has the status [%s].",
agent.toString(), hostStatus));
-
- return hostStatus == Status.Up;
+ return hostStatusFromNeighbour;
}
- private boolean isVMActivityOnHost(Host agent, DateTime suspectTime)
throws HACheckerException {
- if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM &&
agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
- throw new IllegalStateException(String.format("Calling KVM
investigator for non KVM Host of type [%s].", agent.getHypervisorType()));
+ private boolean hasVMActivityOnHost(Host host, DateTime suspectTime)
throws HACheckerException {
+ if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM &&
host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
+ throw new IllegalStateException(String.format("Calling KVM
investigator for non KVM Host of type [%s].", host.getHypervisorType()));
}
boolean activityStatus = true;
- HashMap<StoragePool, List<Volume>> poolVolMap =
getVolumeUuidOnHost(agent);
- for (StoragePool pool : poolVolMap.keySet()) {
- activityStatus = verifyActivityOfStorageOnHost(poolVolMap, pool,
agent, suspectTime, activityStatus);
+ HashMap<StoragePool, List<Volume>> poolVolumeMap =
getStoragePoolAndVolumeInfoOnHost(host);
+ for (StoragePool pool : poolVolumeMap.keySet()) {
+ activityStatus = verifyActivityOfStorageOnHost(poolVolumeMap,
pool, host, suspectTime, activityStatus);
if (!activityStatus) {
- logger.warn("It seems that the storage pool [{}] does not have
activity on {}.", pool, agent);
+ logger.warn("It seems that the storage pool [{}] does not have
activity on {}.", pool, host);
break;
}
}
@@ -166,66 +200,64 @@ public class KVMHostActivityChecker extends AdapterBase
implements ActivityCheck
return activityStatus;
}
- protected boolean verifyActivityOfStorageOnHost(HashMap<StoragePool,
List<Volume>> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime,
boolean activityStatus) throws HACheckerException, IllegalStateException {
+ protected boolean verifyActivityOfStorageOnHost(HashMap<StoragePool,
List<Volume>> poolVolMap, StoragePool pool, Host host, DateTime suspectTime,
boolean activityStatus) throws HACheckerException, IllegalStateException {
List<Volume> volume_list = poolVolMap.get(pool);
- final CheckVMActivityOnStoragePoolCommand cmd = new
CheckVMActivityOnStoragePoolCommand(agent, pool, volume_list, suspectTime);
+ final CheckVMActivityOnStoragePoolCommand cmd = new
CheckVMActivityOnStoragePoolCommand(host, pool, volume_list, suspectTime);
- logger.debug("Checking VM activity for {} on storage pool [{}].",
agent.toString(), pool);
+ logger.debug("Checking VM activity for {} on storage pool [{}].",
host.toString(), pool);
try {
- Answer answer = storageManager.sendToPool(pool,
getNeighbors(agent), cmd);
-
+ Answer answer = storageManager.sendToPool(pool,
getNeighbors(host), cmd);
if (answer != null) {
activityStatus = !answer.getResult();
- logger.debug("{} {} activity on storage pool [{}]",
agent.toString(), activityStatus ? "has" : "does not have", pool);
+ logger.debug("{} {} activity on storage pool [{}]",
host.toString(), activityStatus ? "has" : "does not have", pool);
} else {
- String message = String.format("Did not get a valid response
for VM activity check for %s on storage pool [%s].", agent.toString(), pool);
+ String message = String.format("Did not get a valid response
for VM activity check for %s on storage pool [%s].", host.toString(), pool);
logger.debug(message);
throw new IllegalStateException(message);
}
- } catch (StorageUnavailableException e){
- String message = String.format("Storage [%s] is unavailable to do
the check, probably the %s is not reachable.", pool, agent);
+ } catch (StorageUnavailableException e) {
+ String message = String.format("Storage [%s] is unavailable to do
the check, probably the %s is not reachable.", pool, host);
logger.warn(message, e);
throw new HACheckerException(message, e);
}
return activityStatus;
}
- private HashMap<StoragePool, List<Volume>> getVolumeUuidOnHost(Host agent)
{
- List<VMInstanceVO> vm_list = vmInstanceDao.listByHostId(agent.getId());
- List<VolumeVO> volume_list = new ArrayList<VolumeVO>();
- for (VirtualMachine vm : vm_list) {
+ private HashMap<StoragePool, List<Volume>>
getStoragePoolAndVolumeInfoOnHost(Host host) {
+ List<VMInstanceVO> vmListOnHost =
vmInstanceDao.listByHostId(host.getId());
+ List<VolumeVO> volumeListOnHost = new ArrayList<>();
+ for (VirtualMachine vm : vmListOnHost) {
logger.debug("Retrieving volumes of VM [{}]...", vm);
- List<VolumeVO> vm_volume_list =
volumeDao.findByInstance(vm.getId());
- volume_list.addAll(vm_volume_list);
+ List<VolumeVO> volumeListOfVM =
volumeDao.findByInstance(vm.getId());
+ volumeListOnHost.addAll(volumeListOfVM);
}
- HashMap<StoragePool, List<Volume>> poolVolMap = new
HashMap<StoragePool, List<Volume>>();
- for (Volume vol : volume_list) {
- StoragePool sp = storagePool.findById(vol.getPoolId());
- logger.debug("Retrieving storage pool [{}] of volume [{}]...", sp,
vol);
- if (!poolVolMap.containsKey(sp)) {
- List<Volume> list = new ArrayList<Volume>();
- list.add(vol);
+ HashMap<StoragePool, List<Volume>> poolVolumeMap = new HashMap<>();
+ for (Volume volume : volumeListOnHost) {
+ StoragePool pool = storagePoolDao.findById(volume.getPoolId());
+ logger.debug("Retrieving storage pool [{}] of volume [{}]...",
pool, volume);
+ if (!poolVolumeMap.containsKey(pool)) {
+ List<Volume> volList = new ArrayList<>();
+ volList.add(volume);
- poolVolMap.put(sp, list);
+ poolVolumeMap.put(pool, volList);
} else {
- poolVolMap.get(sp).add(vol);
+ poolVolumeMap.get(pool).add(volume);
}
}
- return poolVolMap;
+ return poolVolumeMap;
}
- public long[] getNeighbors(Host agent) {
- List<Long> neighbors = new ArrayList<Long>();
- List<HostVO> cluster_hosts =
resourceManager.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
- logger.debug("Retrieving all \"Up\" hosts from cluster [{}]...",
clusterDao.findById(agent.getClusterId()));
- for (HostVO host : cluster_hosts) {
- if (host.getId() == agent.getId() || (host.getHypervisorType() !=
Hypervisor.HypervisorType.KVM && host.getHypervisorType() !=
Hypervisor.HypervisorType.LXC)) {
+ public long[] getNeighbors(Host host) {
+ List<Long> neighbors = new ArrayList<>();
+ List<HostVO> clusterHosts =
resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up);
+ logger.debug("Retrieving all \"Up\" hosts from cluster [{}]...",
clusterDao.findById(host.getClusterId()));
+ for (HostVO clusterHost : clusterHosts) {
+ if (clusterHost.getId() == host.getId() ||
(clusterHost.getHypervisorType() != Hypervisor.HypervisorType.KVM &&
clusterHost.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
continue;
}
- neighbors.add(host.getId());
+ neighbors.add(clusterHost.getId());
}
return ArrayUtils.toPrimitive(neighbors.toArray(new
Long[neighbors.size()]));
}
-
}
diff --git
a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java
b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java
index 0fae9807243..662b09a3044 100644
---
a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java
+++
b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java
@@ -3130,7 +3130,7 @@ public class LibvirtComputingResourceTest {
assertNotNull(wrapper);
final Answer answer = wrapper.execute(command,
libvirtComputingResourceMock);
- assertTrue(answer.getResult());
+ assertFalse(answer.getResult());
verify(libvirtComputingResourceMock, times(1)).getMonitor();
}
diff --git
a/plugins/hypervisors/simulator/src/main/java/com/cloud/ha/SimulatorInvestigator.java
b/plugins/hypervisors/simulator/src/main/java/com/cloud/ha/SimulatorInvestigator.java
index 7114a841157..95a6a1291cf 100644
---
a/plugins/hypervisors/simulator/src/main/java/com/cloud/ha/SimulatorInvestigator.java
+++
b/plugins/hypervisors/simulator/src/main/java/com/cloud/ha/SimulatorInvestigator.java
@@ -54,13 +54,13 @@ public class SimulatorInvestigator extends AdapterBase
implements Investigator {
}
@Override
- public Status isAgentAlive(Host agent) {
+ public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() != HypervisorType.Simulator) {
return null;
}
if (haManager.isHAEligible(agent)) {
- return haManager.getHostStatus(agent);
+ return haManager.getHostStatusFromHAConfig(agent);
}
CheckOnHostCommand cmd = new CheckOnHostCommand(agent);
diff --git
a/plugins/hypervisors/vmware/src/main/java/com/cloud/ha/VmwareInvestigator.java
b/plugins/hypervisors/vmware/src/main/java/com/cloud/ha/VmwareInvestigator.java
index 5bfc1896843..3d4fab0d229 100644
---
a/plugins/hypervisors/vmware/src/main/java/com/cloud/ha/VmwareInvestigator.java
+++
b/plugins/hypervisors/vmware/src/main/java/com/cloud/ha/VmwareInvestigator.java
@@ -28,7 +28,7 @@ public class VmwareInvestigator extends AdapterBase
implements Investigator {
}
@Override
- public Status isAgentAlive(Host agent) {
+ public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() == HypervisorType.VMware)
return Status.Disconnected;
diff --git
a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStoragePool.java
b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStoragePool.java
index bb354bec9b4..1bcfaa4ebf7 100644
---
a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStoragePool.java
+++
b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStoragePool.java
@@ -228,11 +228,11 @@ public class LinstorStoragePool implements KVMStoragePool
{
public String createHeartBeatCommand(HAStoragePool pool, String
hostPrivateIp,
boolean hostValidation) {
LOGGER.trace(String.format("Linstor.createHeartBeatCommand: %s, %s,
%b", pool.getPoolIp(), hostPrivateIp, hostValidation));
- boolean isStorageNodeUp = checkingHeartBeat(pool, null);
+ boolean isStorageNodeUp = hasHeartBeat(pool, null);
if (!isStorageNodeUp && !hostValidation) {
//restart the host
LOGGER.debug(String.format("The host [%s] will be restarted
because the health check failed for the storage pool [%s]", hostPrivateIp,
pool.getPool().getType()));
- Script cmd = new Script(pool.getPool().getHearthBeatPath(),
Duration.millis(HeartBeatUpdateTimeout), LOGGER);
+ Script cmd = new Script(pool.getPool().getHearthBeatPath(),
Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER);
cmd.add("-c");
cmd.execute();
return "Down";
@@ -258,7 +258,7 @@ public class LinstorStoragePool implements KVMStoragePool {
}
@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
String hostName;
if (host == null) {
hostName = localNodeName;
@@ -274,7 +274,7 @@ public class LinstorStoragePool implements KVMStoragePool {
}
private String executeDrbdSetupStatus(OutputInterpreter.AllLinesParser
parser) {
- Script sc = new Script("drbdsetup",
Duration.millis(HeartBeatUpdateTimeout), LOGGER);
+ Script sc = new Script("drbdsetup",
Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER);
sc.add("status");
sc.add("--json");
return sc.execute(parser);
@@ -329,7 +329,7 @@ public class LinstorStoragePool implements KVMStoragePool {
}
private String executeDrbdEventsNow(OutputInterpreter.AllLinesParser
parser) {
- Script sc = new Script("drbdsetup",
Duration.millis(HeartBeatUpdateTimeout), LOGGER);
+ Script sc = new Script("drbdsetup",
Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER);
sc.add("events2");
sc.add("--now");
return sc.execute(parser);
@@ -369,8 +369,8 @@ public class LinstorStoragePool implements KVMStoragePool {
}
@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration) {
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath,
long duration) {
LOGGER.trace(String.format("Linstor.vmActivityCheck: %s, %s",
pool.getPoolIp(), host.getPrivateNetwork().getIp()));
- return checkingHeartBeat(pool, host);
+ return hasHeartBeat(pool, host);
}
}
diff --git
a/plugins/storage/volume/storpool/src/main/java/com/cloud/hypervisor/kvm/storage/StorPoolStoragePool.java
b/plugins/storage/volume/storpool/src/main/java/com/cloud/hypervisor/kvm/storage/StorPoolStoragePool.java
index ab5dc03d343..04100a3c6d3 100644
---
a/plugins/storage/volume/storpool/src/main/java/com/cloud/hypervisor/kvm/storage/StorPoolStoragePool.java
+++
b/plugins/storage/volume/storpool/src/main/java/com/cloud/hypervisor/kvm/storage/StorPoolStoragePool.java
@@ -198,11 +198,11 @@ public class StorPoolStoragePool implements
KVMStoragePool {
@Override
public String createHeartBeatCommand(HAStoragePool primaryStoragePool,
String hostPrivateIp, boolean hostValidation) {
- boolean isStorageNodeUp = checkingHeartBeat(primaryStoragePool, null);
+ boolean isStorageNodeUp = hasHeartBeat(primaryStoragePool, null);
if (!isStorageNodeUp && !hostValidation) {
//restart the host
logger.debug(String.format("The host [%s] will be restarted
because the health check failed for the storage pool [%s]", hostPrivateIp,
primaryStoragePool.getPool().getType()));
- Script cmd = new
Script(primaryStoragePool.getPool().getHearthBeatPath(),
HeartBeatUpdateTimeout, logger);
+ Script cmd = new
Script(primaryStoragePool.getPool().getHearthBeatPath(),
HeartBeatUpdateTimeoutInMs, logger);
cmd.add("-c");
cmd.execute();
return "Down";
@@ -240,7 +240,7 @@ public class StorPoolStoragePool implements KVMStoragePool {
}
@Override
- public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) {
+ public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) {
boolean isNodeWorking = false;
OutputInterpreter.AllLinesParser parser = new
OutputInterpreter.AllLinesParser();
@@ -300,8 +300,8 @@ public class StorPoolStoragePool implements KVMStoragePool {
}
@Override
- public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUuidListString, String vmActivityCheckPath,
long duration) {
- return checkingHeartBeat(pool, host);
+ public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration
activityScriptTimeout, String volumeUuidListString, String vmActivityCheckPath,
long duration) {
+ return hasHeartBeat(pool, host);
}
@Override
diff --git a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
index 9b7eadada69..1fa49b80776 100755
--- a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
+++ b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
@@ -75,7 +75,7 @@ fi
#delete VMs on this mountpoint
deleteVMs() {
local mountPoint=$1
- vmPids=$(ps aux| grep qemu | grep "$mountPoint" | awk '{print $2}' 2>
/dev/null)
+ vmPids=$(ps aux | grep qemu | grep "$mountPoint" | awk '{print $2}' 2>
/dev/null)
if [ $? -gt 0 ]
then
return
@@ -93,7 +93,7 @@ deleteVMs() {
}
#checking is there the same nfs server mounted under $MountPoint?
-mounts=$(cat /proc/mounts |grep nfs|grep $MountPoint)
+mounts=$(cat /proc/mounts | grep nfs | grep $MountPoint)
if [ $? -gt 0 ]
then
# remount it
diff --git a/server/src/main/java/com/cloud/ha/CheckOnAgentInvestigator.java
b/server/src/main/java/com/cloud/ha/CheckOnAgentInvestigator.java
index d7945ef2077..e0dbc8fcbe1 100644
--- a/server/src/main/java/com/cloud/ha/CheckOnAgentInvestigator.java
+++ b/server/src/main/java/com/cloud/ha/CheckOnAgentInvestigator.java
@@ -38,7 +38,7 @@ public class CheckOnAgentInvestigator extends AdapterBase
implements Investigato
}
@Override
- public Status isAgentAlive(Host agent) {
+ public Status getHostAgentStatus(Host agent) {
return null;
}
diff --git a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
index c8635397b66..755de00dec2 100644
--- a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
+++ b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
@@ -42,6 +42,9 @@ import
org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.Configurable;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
+import org.apache.cloudstack.ha.HAConfig;
+import org.apache.cloudstack.ha.HAResource;
+import org.apache.cloudstack.ha.dao.HAConfigDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
@@ -223,6 +226,8 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
@Inject
ConfigurationDao _configDao;
@Inject
+ HAConfigDao _haConfigDao;
+ @Inject
VolumeOrchestrationService volumeMgr;
String _instance;
@@ -237,25 +242,53 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
long _timeBetweenCleanups;
String _haTag = null;
+ protected HighAvailabilityManagerImpl() {
+ }
+
private boolean vmHasPendingHAJob(final List<HaWorkVO> pendingHaWorks,
final VMInstanceVO vm) {
Optional<HaWorkVO> item = pendingHaWorks.stream()
.filter(h -> h.getInstanceId() == vm.getId())
.reduce((first, second) -> second);
if (item.isPresent() && (item.get().getTimesTried() < _maxRetries ||
!item.get().canScheduleNew(_timeBetweenFailures))) {
- logger.debug(String.format("Skipping HA on %s as there is already
a running HA job for it", vm));
+ logger.debug("Skipping HA on {} as there is already a running HA
job for it", vm);
return true;
}
return false;
}
- protected HighAvailabilityManagerImpl() {
+ private boolean isHostHAInspectionInProgress(long hostId) {
+ final HAConfig haConfig = _haConfigDao.findHAResource(hostId,
HAResource.ResourceType.Host);
+ if (haConfig == null || !haConfig.isEnabled()) {
+ return false;
+ }
+
+ HAConfig.HAState state = haConfig.getState();
+ logger.debug("Checking whether Host HA inspection is in progress
for the host {} from HAConfig, HA state is {}", hostId, state);
+ if (state == HAConfig.HAState.Suspect || state ==
HAConfig.HAState.Checking) {
+ return true;
+ }
+
+ if (state == HAConfig.HAState.Recovered || state ==
HAConfig.HAState.Available) {
+ // If the host HA state is Recovered, it indicates that the host
has restarted successfully.
+ // If the host HA state is Available, it means the host has
restarted successfully and the recovery waiting period has completed.
+ // In both states, the agent can connect as soon as the host is
ready (and can move to Suspect -> Checking HA state if the agent connection
fails again before Fencing).
+ final HostVO host = _hostDao.findById(hostId);
+ if (host != null && host.getStatus() != Status.Up) {
+ logger.debug("{} is in {} status and HA state is {},
considering Host HA inspection is still in progress" +
+ " until we are sure the host is ready after a recovery
wait period and agent is connected/Up", host, host.getStatus(), state);
+ return true;
+ }
+ }
+
+ return false;
}
@Override
public Status investigate(final long hostId) {
final HostVO host = _hostDao.findById(hostId);
if (host == null) {
+ logger.warn("Host with id {} is removed or doesn't exist.",
hostId);
return Status.Alert;
}
@@ -270,7 +303,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
Status hostState = null;
for (Investigator investigator : investigators) {
- hostState = investigator.isAgentAlive(host);
+ hostState = investigator.getHostAgentStatus(host);
if (hostState != null) {
if (logger.isDebugEnabled()) {
logger.debug("{} was able to determine host {} is in {}",
investigator.getName(), host, hostState.toString());
@@ -278,7 +311,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
return hostState;
}
if (logger.isDebugEnabled()) {
- logger.debug(investigator.getName() + " unable to determine
the state of the host. Moving on.");
+ logger.debug("{} unable to determine the state of the host.
Moving on.", investigator.getName());
}
}
@@ -570,9 +603,9 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
protected Long restart(final HaWorkVO work) {
- logger.debug("RESTART with HAWORK");
+ logger.debug("RESTART with HA WORK");
List<HaWorkVO> items =
_haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
- if (items.size() > 0) {
+ if (!items.isEmpty()) {
StringBuilder str = new StringBuilder("Cancelling this work item
because newer ones have been scheduled. Work Ids = [");
for (HaWorkVO item : items) {
str.append(item.getId()).append(", ");
@@ -583,7 +616,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
items = _haDao.listRunningHaWorkForVm(work.getInstanceId());
- if (items.size() > 0) {
+ if (!items.isEmpty()) {
StringBuilder str = new StringBuilder("Waiting because there's HA
work being executed on an item currently. Work Ids =[");
for (HaWorkVO item : items) {
str.append(item.getId()).append(", ");
@@ -597,21 +630,21 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
VirtualMachine vm = _itMgr.findById(work.getInstanceId());
if (vm == null) {
- logger.info("Unable to find vm: " + vmId);
+ logger.info("Unable to find vm: {}", vmId);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;
}
- logger.info("HA on " + vm);
+ logger.info("HA on {}", vm);
if (vm.getState() != work.getPreviousState() || vm.getUpdated() !=
work.getUpdateTime()) {
- logger.info("VM " + vm + " has been changed. Current State = " +
vm.getState() + " Previous State = " + work.getPreviousState() + " last updated
= " +
- vm.getUpdated() + " previous updated = " +
work.getUpdateTime());
+ logger.info("VM {} has been changed. Current State = {} Previous
State = {} last updated = {} previous updated = {}",
+ vm, vm.getState(), work.getPreviousState(),
vm.getUpdated(), work.getUpdateTime());
return null;
}
if (vm.getHostId() != null &&
!vm.getHostId().equals(work.getHostId())) {
- logger.info("VM " + vm + " has been changed. Current host id = "
+ vm.getHostId() + " Previous host id = " + work.getHostId());
+ logger.info("VM {} has been changed. Current host id = {}
Previous host id = {}", vm, vm.getHostId(), work.getHostId());
return null;
}
@@ -628,10 +661,13 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
boolean isHostRemoved = false;
if (host == null) {
host = _hostDao.findByIdIncludingRemoved(work.getHostId());
- if (host != null) {
- logger.debug("VM {} is now no longer on host {} as the host is
removed", vm, host);
- isHostRemoved = true;
+ if (host == null) {
+ logger.debug("VM {} is now no longer on host {}, the host
doesn't exist", vm, work.getHostId());
+ return null;
}
+
+ logger.debug("VM {} is now no longer on host {} as the host is
removed", vm, host);
+ isHostRemoved = true;
}
DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
@@ -652,40 +688,39 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
try
{
alive = investigator.isVmAlive(vm, host);
- logger.info(investigator.getName() + " found " + vm +
" to be alive? " + alive);
+ logger.info("{} found {} to be alive? {}",
investigator.getName(), vm, alive);
break;
} catch (UnknownVM e) {
- logger.info(investigator.getName() + " could not find
" + vm);
+ logger.info("{} could not find {}",
investigator.getName(), vm);
}
}
boolean fenced = false;
if (alive == null) {
- logger.debug("Fencing off VM that we don't know the state
of");
+ logger.debug("Fencing off VM {} that we don't know the
state of", vm);
for (FenceBuilder fb : fenceBuilders) {
Boolean result = fb.fenceOff(vm, host);
- logger.info("Fencer " + fb.getName() + " returned " +
result);
+ logger.info("Fencer {} returned {}", fb.getName(),
result);
if (result != null && result) {
fenced = true;
break;
}
}
-
} else if (!alive) {
fenced = true;
} else {
- logger.debug("VM {} is found to be alive by {}", vm,
investigator.getName());
+ logger.debug("VM {} is found to be alive by {} on host
{}", vm, investigator.getName(), host);
if (host.getStatus() == Status.Up) {
- logger.info(vm + " is alive and host is up. No need to
restart it.");
+ logger.info("{} is alive and host {} is up. No need to
restart it.", vm, host);
return null;
} else {
- logger.debug("Rescheduling because the host is not up
but the vm is alive");
+ logger.debug("Rescheduling because the host {} is not
up but the vm {} is alive", host, vm);
return (System.currentTimeMillis() >> 10) +
_investigateRetryInterval;
}
}
if (!fenced) {
- logger.debug("We were unable to fence off the VM " + vm);
+ logger.debug("We were unable to fence off the VM {}", vm);
_alertMgr.sendAlert(alertType, vm.getDataCenterId(),
vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() +
" which was running on host " + hostDesc,
"Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " +
vmId +
" which was running on host " + hostDesc);
@@ -728,15 +763,15 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
if (!ForceHA.value() && !vm.isHaEnabled()) {
if (logger.isDebugEnabled()) {
- logger.debug("VM is not HA enabled so we're done.");
+ logger.debug("VM {} is not HA enabled so we're done.", vm);
}
return null; // VM doesn't require HA
}
- if ((host == null || host.getRemoved() != null || host.getState() !=
Status.Up)
+ if ((host.getRemoved() != null || host.getState() != Status.Up)
&& !volumeMgr.canVmRestartOnAnotherServer(vm.getId())) {
if (logger.isDebugEnabled()) {
- logger.debug("VM can not restart on another server.");
+ logger.debug("VM {} can not restart on another server.", vm);
}
return null;
}
@@ -777,13 +812,13 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
if (started != null && started.getState() ==
VirtualMachine.State.Running) {
String message = String.format("HA starting VM: %s (%s)",
started.getHostName(), started.getInstanceName());
HostVO hostVmHasStarted =
_hostDao.findById(started.getHostId());
- logger.info(String.format("HA is now restarting %s on %s",
started, hostVmHasStarted));
+ logger.info("HA is now restarting {} on {}", started,
hostVmHasStarted);
_alertMgr.sendAlert(alertType, vm.getDataCenterId(),
vm.getPodIdToDeployIn(), message, message);
return null;
}
if (logger.isDebugEnabled()) {
- logger.debug("Rescheduling VM " + vm.toString() + " to try
again in " + _restartRetryInterval);
+ logger.debug("Rescheduling VM {} to try again in {}",
vm.toString(), _restartRetryInterval);
}
} catch (final InsufficientCapacityException e) {
logger.warn("Unable to restart " + vm.toString() + " due to " +
e.getMessage());
@@ -815,6 +850,9 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
if (!CancellableWorkReasonTypes.contains(work.getReasonType())) {
return false;
}
+ if (isHostHAInspectionInProgress(work.getHostId())) {
+ return false;
+ }
Status hostStatus = investigate(work.getHostId());
if (!Status.Up.equals(hostStatus)) {
return false;
@@ -825,13 +863,14 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
public Long migrate(final HaWorkVO work) {
+ logger.debug("MIGRATE with HA WORK");
long vmId = work.getInstanceId();
long srcHostId = work.getHostId();
HostVO srcHost = _hostDao.findById(srcHostId);
VMInstanceVO vm = _instanceDao.findById(vmId);
if (vm == null) {
- logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
+ logger.info("Unable to find vm: {}, skipping migrate.", vmId);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
@@ -840,11 +879,11 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
logger.info("Migration attempt: for {} from {}. Starting attempt:
{}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries);
if (VirtualMachine.State.Stopped.equals(vm.getState())) {
- logger.info(String.format("vm %s is Stopped, skipping migrate.",
vm));
+ logger.info("vm {} is Stopped, skipping migrate.", vm);
return null;
}
if (VirtualMachine.State.Running.equals(vm.getState()) && srcHostId !=
vm.getHostId()) {
- logger.info(String.format("VM %s is running on a different host
%s, skipping migration", vm, vm.getHostId()));
+ logger.info("VM {} is running on a different host {}, skipping
migration", vm, vm.getHostId());
return null;
}
@@ -879,7 +918,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(),
WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(),
reasonType);
_haDao.persist(work);
if (logger.isDebugEnabled()) {
- logger.debug("Scheduled " + work.toString());
+ logger.debug("Scheduled {}", work.toString());
}
wakeupWorkers();
return true;
@@ -897,7 +936,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
private void destroyVM(VirtualMachine vm, boolean expunge) throws
OperationTimedoutException, AgentUnavailableException {
- logger.info("Destroying " + vm.toString());
+ logger.info("Destroying {}", vm.toString());
if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
consoleProxyManager.destroyProxy(vm.getId());
} else if
(VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
@@ -908,9 +947,10 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
protected Long destroyVM(final HaWorkVO work) {
+ logger.debug("DESTROY with HA WORK");
final VirtualMachine vm = _itMgr.findById(work.getInstanceId());
if (vm == null) {
- logger.info("No longer can find VM " + work.getInstanceId() + ".
Throwing away " + work);
+ logger.info("No longer can find VM {}. Throwing away {}",
work.getInstanceId(), work);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
@@ -944,20 +984,21 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
protected Long stopVM(final HaWorkVO work) throws
ConcurrentOperationException {
+ logger.debug("STOP with HA WORK");
VirtualMachine vm = _itMgr.findById(work.getInstanceId());
if (vm == null) {
- logger.info("No longer can find VM " + work.getInstanceId() + ".
Throwing away " + work);
+ logger.info("No longer can find VM {}. Throwing away {}",
work.getInstanceId(), work);
work.setStep(Step.Done);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;
}
- logger.info("Stopping " + vm);
+ logger.info("Stopping {}", vm);
try {
if (work.getWorkType() == WorkType.Stop) {
_itMgr.advanceStop(vm.getUuid(), false);
- logger.info("Successfully stopped " + vm);
+ logger.info("Successfully stopped {}", vm);
return null;
} else if (work.getWorkType() == WorkType.CheckStop) {
if ((vm.getState() != work.getPreviousState()) ||
vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null ||
@@ -969,7 +1010,7 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
_itMgr.advanceStop(vm.getUuid(), false);
- logger.info("Stop for " + vm + " was successful");
+ logger.info("Stop for {} was successful", vm);
return null;
} else if (work.getWorkType() == WorkType.ForceStop) {
if ((vm.getState() != work.getPreviousState()) ||
vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null ||
@@ -981,13 +1022,13 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
_itMgr.advanceStop(vm.getUuid(), true);
- logger.info("Stop for " + vm + " was successful");
+ logger.info("Stop for {} was successful", vm);
return null;
} else {
assert false : "Who decided there's other steps but didn't
modify the guy who does the work?";
}
} catch (final ResourceUnavailableException e) {
- logger.debug("Agnet is not available" + e.getMessage());
+ logger.debug("Agent is not available: {}", e.getMessage());
} catch (OperationTimedoutException e) {
logger.debug("operation timed out: " + e.getMessage());
}
@@ -1043,7 +1084,8 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
try {
if (vm != null && !VmHaEnabled.valueIn(vm.getDataCenterId())) {
if (logger.isDebugEnabled()) {
- logger.debug(String.format("VM high availability manager
is disabled, rescheduling the HA work %s, for the VM %s (id) to retry later in
case VM high availability manager is enabled on retry attempt", work,
vm.getName(), vm.getId()));
+ logger.debug("VM high availability manager is disabled,
rescheduling the HA work {} for the VM {} ({}) " +
+ "to retry later in case VM high availability
manager is enabled on retry attempt", work, vm.getName(), vm.getId());
}
long nextTime = getRescheduleTime(wt);
rescheduleWork(work, nextTime);
@@ -1065,13 +1107,13 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
}
if (nextTime == null) {
- logger.info("Completed work " + work + ". Took " +
(work.getTimesTried() + 1) + "/" + _maxRetries + " attempts.");
+ logger.info("Completed work {}. Took {}/{} attempts.", work,
work.getTimesTried() + 1, _maxRetries);
work.setStep(Step.Done);
} else {
rescheduleWork(work, nextTime.longValue());
}
} catch (Exception e) {
- logger.warn("Encountered unhandled exception during HA process,
reschedule work", e);
+ logger.warn("Encountered unhandled exception during HA process,
reschedule work {}", work, e);
long nextTime = getRescheduleTime(wt);
rescheduleWork(work, nextTime);
@@ -1085,11 +1127,11 @@ public class HighAvailabilityManagerImpl extends
ManagerBase implements Configur
} finally {
if (!Step.Done.equals(work.getStep())) {
if (work.getTimesTried() >= _maxRetries) {
- logger.warn("Giving up, retried max " +
work.getTimesTried() + "/" + _maxRetries + " times for work: " + work);
+ logger.warn("Giving up, retried max {}/{} times for work:
{}", work.getTimesTried(), _maxRetries, work);
work.setStep(Step.Done);
} else {
- logger.warn("Rescheduling work " + work + " to try again
at " + new Date(work.getTimeToTry() << 10) +
- ". Finished attempt " + work.getTimesTried() + "/"
+ _maxRetries + " times.");
+ logger.warn("Rescheduling work {} to try again at {}.
Finished attempt {}/{} times.",
+ work, new Date(work.getTimeToTry() << 10),
work.getTimesTried(), _maxRetries);
}
}
_haDao.update(work.getId(), work);
diff --git a/server/src/main/java/com/cloud/ha/KVMFencer.java
b/server/src/main/java/com/cloud/ha/KVMFencer.java
index b51ed00b028..4a6606b09cc 100644
--- a/server/src/main/java/com/cloud/ha/KVMFencer.java
+++ b/server/src/main/java/com/cloud/ha/KVMFencer.java
@@ -74,7 +74,7 @@ public class KVMFencer extends AdapterBase implements
FenceBuilder {
@Override
public Boolean fenceOff(VirtualMachine vm, Host host) {
if (host.getHypervisorType() != HypervisorType.KVM &&
host.getHypervisorType() != HypervisorType.LXC) {
- logger.warn("Don't know how to fence non kvm hosts " +
host.getHypervisorType());
+ logger.warn("Don't know how to fence non kvm hosts {}",
host.getHypervisorType());
return null;
}
@@ -97,11 +97,8 @@ public class KVMFencer extends AdapterBase implements
FenceBuilder {
FenceAnswer answer;
try {
answer = (FenceAnswer)_agentMgr.send(h.getId(), fence);
- } catch (AgentUnavailableException e) {
- logger.info("Moving on to the next host because " +
h.toString() + " is unavailable", e);
- continue;
- } catch (OperationTimedoutException e) {
- logger.info("Moving on to the next host because " +
h.toString() + " is unavailable", e);
+ } catch (AgentUnavailableException |
OperationTimedoutException e) {
+ logger.info("Moving on to the next host because {} is
unavailable", h.toString(), e);
continue;
}
if (answer != null && answer.getResult()) {
@@ -115,7 +112,7 @@ public class KVMFencer extends AdapterBase implements
FenceBuilder {
"Fencing off host " + host.getId() + " did not
succeed after asking " + i + " hosts. " +
"Check Agent logs for more information.");
- logger.error("Unable to fence off " + vm.toString() + " on " +
host.toString());
+ logger.error("Unable to fence off {} on {}", vm.toString(),
host.toString());
return false;
}
diff --git
a/server/src/main/java/com/cloud/ha/ManagementIPSystemVMInvestigator.java
b/server/src/main/java/com/cloud/ha/ManagementIPSystemVMInvestigator.java
index 0972f2451af..d14c4baafb3 100644
--- a/server/src/main/java/com/cloud/ha/ManagementIPSystemVMInvestigator.java
+++ b/server/src/main/java/com/cloud/ha/ManagementIPSystemVMInvestigator.java
@@ -104,7 +104,7 @@ public class ManagementIPSystemVMInvestigator extends
AbstractInvestigatorImpl {
}
@Override
- public Status isAgentAlive(Host agent) {
+ public Status getHostAgentStatus(Host agent) {
return null;
}
diff --git a/server/src/main/java/com/cloud/ha/UserVmDomRInvestigator.java
b/server/src/main/java/com/cloud/ha/UserVmDomRInvestigator.java
index 7d063b3088e..82074460f2a 100644
--- a/server/src/main/java/com/cloud/ha/UserVmDomRInvestigator.java
+++ b/server/src/main/java/com/cloud/ha/UserVmDomRInvestigator.java
@@ -103,7 +103,7 @@ public class UserVmDomRInvestigator extends
AbstractInvestigatorImpl {
}
@Override
- public Status isAgentAlive(Host agent) {
+ public Status getHostAgentStatus(Host agent) {
if (logger.isDebugEnabled()) {
logger.debug("checking if agent ({}) is alive", agent);
}
diff --git a/server/src/main/java/com/cloud/ha/XenServerInvestigator.java
b/server/src/main/java/com/cloud/ha/XenServerInvestigator.java
index 5482a7f148e..fea44c97ab5 100644
--- a/server/src/main/java/com/cloud/ha/XenServerInvestigator.java
+++ b/server/src/main/java/com/cloud/ha/XenServerInvestigator.java
@@ -46,7 +46,7 @@ public class XenServerInvestigator extends AdapterBase
implements Investigator {
}
@Override
- public Status isAgentAlive(Host agent) {
+ public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() != HypervisorType.XenServer) {
return null;
}
@@ -74,7 +74,7 @@ public class XenServerInvestigator extends AdapterBase
implements Investigator {
@Override
public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM {
- Status status = isAgentAlive(host);
+ Status status = getHostAgentStatus(host);
if (status == null) {
throw new UnknownVM();
}
diff --git a/server/src/main/java/org/apache/cloudstack/ha/HAManager.java
b/server/src/main/java/org/apache/cloudstack/ha/HAManager.java
index 8282c621c1e..068230c6673 100644
--- a/server/src/main/java/org/apache/cloudstack/ha/HAManager.java
+++ b/server/src/main/java/org/apache/cloudstack/ha/HAManager.java
@@ -67,11 +67,16 @@ public interface HAManager extends HAConfigManager {
"The number of pending fence operations per management server.
This setting determines the size of the FENCE queue.", true);
boolean transitionHAState(final HAConfig.Event event, final HAConfig
haConfig);
+
HAProvider getHAProvider(final String name);
+
HAResourceCounter getHACounter(final Long resourceId, final
HAResource.ResourceType resourceType);
+
void purgeHACounter(final Long resourceId, final HAResource.ResourceType
resourceType);
boolean isHAEligible(final HAResource resource);
+
Boolean isVMAliveOnHost(final Host host) throws Investigator.UnknownVM;
- Status getHostStatus(final Host host);
+
+ Status getHostStatusFromHAConfig(final Host host);
}
diff --git a/server/src/main/java/org/apache/cloudstack/ha/HAManagerImpl.java
b/server/src/main/java/org/apache/cloudstack/ha/HAManagerImpl.java
index a016be5c6e3..93b237ec20a 100644
--- a/server/src/main/java/org/apache/cloudstack/ha/HAManagerImpl.java
+++ b/server/src/main/java/org/apache/cloudstack/ha/HAManagerImpl.java
@@ -139,9 +139,7 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
public synchronized void purgeHACounter(final Long resourceId, final
HAResource.ResourceType resourceType) {
final String key = resourceCounterKey(resourceId, resourceType);
- if (haCounterMap.containsKey(key)) {
- haCounterMap.remove(key);
- }
+ haCounterMap.remove(key);
}
public boolean transitionHAState(final HAConfig.Event event, final
HAConfig haConfig) {
@@ -248,6 +246,7 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
}
private boolean isHAEnabledForCluster(final HAResource resource) {
+ // HA is enabled by default when cluster details don't exist
if (resource == null || resource.getClusterId() == null) {
return true;
}
@@ -259,14 +258,10 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
if (resource == null || resource.getId() < 1L) {
return false;
}
- HAResource.ResourceType resourceType = null;
- if (resource instanceof Host) {
- resourceType = HAResource.ResourceType.Host;
- }
- if (resourceType == null) {
+ if (!(resource instanceof Host)) {
return false;
}
- final HAConfig haConfig = haConfigDao.findHAResource(resource.getId(),
resourceType);
+ final HAConfig haConfig = haConfigDao.findHAResource(resource.getId(),
HAResource.ResourceType.Host);
return haConfig != null && haConfig.isEnabled()
&& haConfig.getState() != HAConfig.HAState.Disabled
&& haConfig.getState() != HAConfig.HAState.Ineligible;
@@ -317,19 +312,23 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
throw new Investigator.UnknownVM();
}
- public Status getHostStatus(final Host host) {
+ public Status getHostStatusFromHAConfig(final Host host) {
final HAConfig haConfig = haConfigDao.findHAResource(host.getId(),
HAResource.ResourceType.Host);
- if (haConfig != null) {
- if (haConfig.getState() == HAConfig.HAState.Fenced) {
- logger.debug("HA: Agent [{}] is available/suspect/checking
Up.", host);
- return Status.Down;
- } else if (haConfig.getState() == HAConfig.HAState.Degraded ||
haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() ==
HAConfig.HAState.Fencing) {
- logger.debug("HA: Agent [{}] is disconnected. State: {}, {}.",
host, haConfig.getState(), haConfig.getState().getDescription());
- return Status.Disconnected;
- }
- return Status.Up;
+ if (haConfig == null) {
+ logger.warn("HA: Agent [{}] config is not available.", host);
+ return Status.Unknown;
}
- return Status.Unknown;
+ if (haConfig.getState() == HAConfig.HAState.Fenced) {
+ logger.debug("HA: Agent [{}] is fenced.", host);
+ return Status.Down;
+ }
+ if (haConfig.getState() == HAConfig.HAState.Degraded ||
haConfig.getState() == HAConfig.HAState.Recovering || haConfig.getState() ==
HAConfig.HAState.Fencing) {
+ logger.debug("HA: Agent [{}] is disconnected. State: {}, {}.",
host, haConfig.getState(), haConfig.getState().getDescription());
+ return Status.Disconnected;
+ }
+
+ logger.debug("HA: Agent [{}] is considered Up (HA state can be
Available/Suspect/Checking/Recovered). State: {}, {}.", host,
haConfig.getState(), haConfig.getState().getDescription());
+ return Status.Up;
}
//////////////////////////////////////////////////////
@@ -511,9 +510,14 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
// Attempt recovery
if (newState == HAConfig.HAState.Recovering) {
- if (counter.getRecoveryCounter() >= (Long)
(haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
+ long recoveryCounter = counter.getRecoveryCounter();
+ Long maxRecoveryAttempts = (Long)
(haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource));
+ if (recoveryCounter >= maxRecoveryAttempts) {
+ logger.debug("Recovery attempts have reached the configured
limit: {} for the resource [{}].", maxRecoveryAttempts, resource);
return false;
}
+
+ logger.debug("Recovery attempt #{} for the resource [{}]. Max
recovery attempts configured is {}.", recoveryCounter + 1, resource,
maxRecoveryAttempts);
final RecoveryTask task = ComponentContext.inject(new
RecoveryTask(resource, haProvider, haConfig,
HAProviderConfig.RecoveryTimeout, recoveryExecutor));
final Future<Boolean> recoveryFuture =
recoveryExecutor.submit(task);
@@ -536,20 +540,20 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
return false;
}
- logger.debug(String.format("HA state pre-transition:: new state=[%s],
old state=[%s], for resource id=[%s], status=[%s], ha config state=[%s]." ,
newState, oldState, haConfig.getResourceId(), status, haConfig.getState()));
+ logger.debug("HA state pre-transition:: new state=[{}], old
state=[{}], for resource id=[{}], status=[{}], ha config state=[{}].",
newState, oldState, haConfig.getResourceId(), status, haConfig.getState());
if (status && haConfig.getState() != newState) {
- logger.warn(String.format("HA state pre-transition:: HA state is
not equal to transition state, HA state=[%s], new state=[%s].",
haConfig.getState(), newState));
+ logger.warn("HA state pre-transition:: HA state is not equal to
transition state, HA state=[{}], new state=[{}].", haConfig.getState(),
newState);
}
return processHAStateChange(haConfig, newState, status);
}
@Override
public boolean postStateTransitionEvent(final
StateMachine2.Transition<HAConfig.HAState, HAConfig.Event> transition, final
HAConfig haConfig, final boolean status, final Object opaque) {
- logger.debug(String.format("HA state post-transition:: new state=[%s],
old state=[%s], for resource id=[%s], status=[%s], ha config state=[%s].",
transition.getToState(), transition.getCurrentState(),
haConfig.getResourceId(), status, haConfig.getState()));
+ logger.debug("HA state post-transition:: new state=[{}], old
state=[{}], for resource id=[{}], status=[{}], ha config state=[{}].",
transition.getToState(), transition.getCurrentState(),
haConfig.getResourceId(), status, haConfig.getState());
if (status && haConfig.getState() != transition.getToState()) {
- logger.warn(String.format("HA state post-transition:: HA state is
not equal to transition state, HA state=[%s], new state=[%s].",
haConfig.getState(), transition.getToState()));
+ logger.warn("HA state post-transition:: HA state is not equal to
transition state, HA state=[{}], new state=[{}].", haConfig.getState(),
transition.getToState());
}
return processHAStateChange(haConfig, transition.getToState(), status);
}
@@ -645,7 +649,7 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
try {
logger.debug("HA health check task is running...");
- final List<HAConfig> haConfigList = new
ArrayList<HAConfig>(haConfigDao.listAll());
+ final List<HAConfig> haConfigList = new
ArrayList<>(haConfigDao.listAll());
for (final HAConfig haConfig : haConfigList) {
currentHaConfig = haConfig;
@@ -676,8 +680,8 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
HAProviderConfig.HealthCheckTimeout,
healthCheckExecutor));
healthCheckExecutor.submit(task);
break;
- default:
- break;
+ default:
+ break;
}
final HAResourceCounter counter =
getHACounter(haConfig.getResourceId(), haConfig.getResourceType());
@@ -695,16 +699,22 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
}
if (haConfig.getState() == HAConfig.HAState.Recovering) {
- if (counter.getRecoveryCounter() >= (Long)
(haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource))) {
+ long recoveryCounter = counter.getRecoveryCounter();
+ Long maxRecoveryAttempts = (Long)
(haProvider.getConfigValue(HAProviderConfig.MaxRecoveryAttempts, resource));
+ if (recoveryCounter >= maxRecoveryAttempts) {
+ logger.debug("Recovery attempts have reached the
max limit: {} for the resource [{}].", maxRecoveryAttempts, resource);
transitionHAState(HAConfig.Event.RecoveryOperationThresholdExceeded, haConfig);
} else {
+ logger.debug("Retry recovery for the resource
[{}]. Max recovery attempts configured is {}.", resource, maxRecoveryAttempts);
transitionHAState(HAConfig.Event.RetryRecovery,
haConfig);
}
}
if (haConfig.getState() == HAConfig.HAState.Recovered) {
counter.markRecoveryStarted();
- if
(counter.canExitRecovery((Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout,
resource)))) {
+ Long recoveryWaitTimeout =
(Long)(haProvider.getConfigValue(HAProviderConfig.RecoveryWaitTimeout,
resource));
+ logger.debug("Recovery started for the resource [{}],
wait period configured to become Available is {} secs", resource,
recoveryWaitTimeout);
+ if (counter.canExitRecovery(recoveryWaitTimeout)) {
if
(transitionHAState(HAConfig.Event.RecoveryWaitPeriodTimeout, haConfig)) {
counter.markRecoveryCompleted();
}
@@ -717,7 +727,7 @@ public final class HAManagerImpl extends ManagerBase
implements HAManager, Clust
}
} catch (Throwable t) {
if (currentHaConfig != null) {
- logger.error(String.format("Error trying to perform health
checks in HA manager [%s].", currentHaConfig.getHaProvider()), t);
+ logger.error("Error trying to perform health checks in HA
manager [{}].", currentHaConfig.getHaProvider(), t);
} else {
logger.error("Error trying to perform health checks in HA
manager.", t);
}
diff --git
a/server/src/main/java/org/apache/cloudstack/ha/HAResourceCounter.java
b/server/src/main/java/org/apache/cloudstack/ha/HAResourceCounter.java
index f493f6926e0..23ebbaa6a7a 100644
--- a/server/src/main/java/org/apache/cloudstack/ha/HAResourceCounter.java
+++ b/server/src/main/java/org/apache/cloudstack/ha/HAResourceCounter.java
@@ -36,6 +36,10 @@ public final class HAResourceCounter {
return activityCheckCounter.get();
}
+ public long getActivityCheckFailureCounter() {
+ return activityCheckFailureCounter.get();
+ }
+
public long getRecoveryCounter() {
return recoveryOperationCounter.get();
}
@@ -66,7 +70,7 @@ public final class HAResourceCounter {
firstHealthCheckFailureTimestamp = null;
}
- public boolean hasActivityThresholdExceeded(final double failureRatio) {
+ public boolean hasActivityFailureThresholdExceeded(final double
failureRatio) {
return activityCheckFailureCounter.get() > (activityCheckCounter.get()
* failureRatio);
}
diff --git
a/server/src/main/java/org/apache/cloudstack/ha/task/ActivityCheckTask.java
b/server/src/main/java/org/apache/cloudstack/ha/task/ActivityCheckTask.java
index 5ddbac626bc..e27d039b392 100644
--- a/server/src/main/java/org/apache/cloudstack/ha/task/ActivityCheckTask.java
+++ b/server/src/main/java/org/apache/cloudstack/ha/task/ActivityCheckTask.java
@@ -62,6 +62,8 @@ public class ActivityCheckTask extends BaseHATask {
return;
}
+ long activityCounter = counter.getActivityCheckCounter();
+ logger.debug("Activity check #{}, result: {} for the resource {}. Max
activity checks configured is {}", activityCounter + 1, result, getResource(),
maxActivityChecks);
counter.incrActivityCounter(!result);
if (counter.getActivityCheckCounter() < maxActivityChecks) {
@@ -69,7 +71,9 @@ public class ActivityCheckTask extends BaseHATask {
return;
}
- if (counter.hasActivityThresholdExceeded(activityCheckFailureRatio)) {
+ long activityCheckFailureCount =
counter.getActivityCheckFailureCounter();
+ logger.debug("{} activity checks failed out of {} checks performed for
the resource {}. Failure threshold configured is {}",
activityCheckFailureCount, maxActivityChecks, getResource(),
activityCheckFailureRatio);
+ if
(counter.hasActivityFailureThresholdExceeded(activityCheckFailureRatio)) {
haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureOverThresholdRatio,
haConfig);
} else {
if
(haManager.transitionHAState(HAConfig.Event.ActivityCheckFailureUnderThresholdRatio,
haConfig)) {
diff --git a/server/src/main/java/org/apache/cloudstack/ha/task/BaseHATask.java
b/server/src/main/java/org/apache/cloudstack/ha/task/BaseHATask.java
index 6dc7b9281ba..bf34d3514b6 100644
--- a/server/src/main/java/org/apache/cloudstack/ha/task/BaseHATask.java
+++ b/server/src/main/java/org/apache/cloudstack/ha/task/BaseHATask.java
@@ -97,7 +97,7 @@ public abstract class BaseHATask implements Callable<Boolean>
{
result = future.get(timeout, TimeUnit.SECONDS);
}
} catch (InterruptedException | ExecutionException e) {
- logger.warn("Exception occurred while running " + getTaskType() +
" on a resource: " + e.getMessage(), e.getCause());
+ logger.warn("Exception occurred while running {} on a resource:
{}", getTaskType(), e.getMessage(), e.getCause());
throwable = e.getCause();
} catch (TimeoutException e) {
logger.trace("{} operation timed out for resource: {}",
getTaskType(), resource);
diff --git
a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
index 9274bd1ff08..626f2cda172 100644
--- a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
+++ b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java
@@ -35,6 +35,7 @@ import
org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationSer
import
org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
+import org.apache.cloudstack.ha.dao.HAConfigDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -118,6 +119,8 @@ public class HighAvailabilityManagerImplTest {
@Mock
ConfigurationDao _configDao;
@Mock
+ HAConfigDao _haConfigDao;
+ @Mock
VolumeOrchestrationService volumeMgr;
@Mock
ConsoleProxyManager consoleProxyManager;
@@ -362,7 +365,7 @@ public class HighAvailabilityManagerImplTest {
investigators.add(investigator);
highAvailabilityManager.setInvestigators(investigators);
// Mock isAgentAlive to return host status as Down
-
Mockito.when(investigator.isAgentAlive(hostVO)).thenReturn(Status.Down);
+
Mockito.when(investigator.getHostAgentStatus(hostVO)).thenReturn(Status.Down);
ConfigKey<Boolean> haEnabled = Mockito.mock(ConfigKey.class);
highAvailabilityManager.VmHaEnabled = haEnabled;
diff --git
a/utils/src/main/java/org/apache/cloudstack/utils/redfish/RedfishClient.java
b/utils/src/main/java/org/apache/cloudstack/utils/redfish/RedfishClient.java
index f9952b2c03b..c5567cfa2e2 100644
--- a/utils/src/main/java/org/apache/cloudstack/utils/redfish/RedfishClient.java
+++ b/utils/src/main/java/org/apache/cloudstack/utils/redfish/RedfishClient.java
@@ -231,21 +231,19 @@ public class RedfishClient {
}
protected HttpResponse retryHttpRequest(String url, HttpRequestBase
httpReq, HttpClient client) {
- logger.warn(String.format("Failed to execute HTTP %s request [URL:
%s]. Executing the request again.", httpReq.getMethod(), url));
+ logger.warn("Failed to execute HTTP {} request [URL: {}]. Executing
the request again.", httpReq.getMethod(), url);
HttpResponse response = null;
for (int attempt = 1; attempt < redfishRequestMaxRetries + 1;
attempt++) {
try {
TimeUnit.SECONDS.sleep(WAIT_FOR_REQUEST_RETRY);
- logger.debug(String.format("HTTP %s request retry attempt
%d/%d [URL: %s].", httpReq.getMethod(), attempt, redfishRequestMaxRetries,
url));
+ logger.debug("HTTP {} request retry attempt {}/{} [URL: {}].",
httpReq.getMethod(), attempt, redfishRequestMaxRetries, url);
response = client.execute(httpReq);
break;
} catch (IOException | InterruptedException e) {
if (attempt == redfishRequestMaxRetries) {
throw new RedfishException(String.format("Failed to
execute HTTP %s request retry attempt %d/%d [URL: %s] due to exception %s",
httpReq.getMethod(), attempt, redfishRequestMaxRetries,url, e));
} else {
- logger.warn(
- String.format("Failed to execute HTTP %s request
retry attempt %d/%d [URL: %s] due to exception %s", httpReq.getMethod(),
attempt, redfishRequestMaxRetries,
- url, e));
+ logger.warn("Failed to execute HTTP {} request retry
attempt {}/{} [URL: {}] due to exception {}", httpReq.getMethod(), attempt,
redfishRequestMaxRetries, url, e);
}
}
}
@@ -312,7 +310,7 @@ public class RedfishClient {
throw new RedfishException(String.format("Failed to execute System
power command for host by performing '%s' request on URL '%s' and host address
'%s'. The expected HTTP status code is '%s' but it got '%s'.",
HttpPost.METHOD_NAME, url, hostAddress,
EXPECTED_HTTP_STATUS, statusCode));
}
- logger.debug(String.format("Sending ComputerSystem.Reset Command '%s'
to host '%s' with request '%s %s'", resetCommand, hostAddress,
HttpPost.METHOD_NAME, url));
+ logger.debug("Sending ComputerSystem.Reset Command '{}' to host '{}'
with request '{} {}'", resetCommand, hostAddress, HttpPost.METHOD_NAME, url);
}
/**
@@ -330,7 +328,7 @@ public class RedfishClient {
String systemId = processGetSystemIdResponse(response);
- logger.debug(String.format("Retrieved System ID '%s' with request '%s:
%s'", systemId, HttpGet.METHOD_NAME, url));
+ logger.debug("Retrieved System ID '{}' with request '{}: {}'",
systemId, HttpGet.METHOD_NAME, url);
return systemId;
}
@@ -384,7 +382,7 @@ public class RedfishClient {
}
RedfishPowerState powerState =
processGetSystemRequestResponse(response);
- logger.debug(String.format("Retrieved System power state '%s' with
request '%s: %s'", powerState, HttpGet.METHOD_NAME, url));
+ logger.debug("Retrieved System power state '{}' with request '{}:
{}'", powerState, HttpGet.METHOD_NAME, url);
return powerState;
}