[ https://issues.apache.org/jira/browse/CLOUDSTACK-10326?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16518231#comment-16518231 ]
ASF GitHub Bot commented on CLOUDSTACK-10326: --------------------------------------------- nvazquez closed pull request #2493: CLOUDSTACK-10326: Prevent hosts fall into Maintenance when there are running VMs on it URL: https://github.com/apache/cloudstack/pull/2493 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/engine/schema/src/com/cloud/vm/dao/VMInstanceDao.java b/engine/schema/src/com/cloud/vm/dao/VMInstanceDao.java index 69efea42df9..6fda4a15c32 100755 --- a/engine/schema/src/com/cloud/vm/dao/VMInstanceDao.java +++ b/engine/schema/src/com/cloud/vm/dao/VMInstanceDao.java @@ -150,4 +150,6 @@ VMInstanceVO findVMByHostNameInZone(String hostName, long zoneId); boolean isPowerStateUpToDate(long instanceId); + + List<VMInstanceVO> listNonMigratingVmsByHostEqualsLastHost(long hostId); } diff --git a/engine/schema/src/com/cloud/vm/dao/VMInstanceDaoImpl.java b/engine/schema/src/com/cloud/vm/dao/VMInstanceDaoImpl.java index 6e97d1275a6..1565f53233b 100755 --- a/engine/schema/src/com/cloud/vm/dao/VMInstanceDaoImpl.java +++ b/engine/schema/src/com/cloud/vm/dao/VMInstanceDaoImpl.java @@ -93,6 +93,7 @@ protected GenericSearchBuilder<VMInstanceVO, String> DistinctHostNameSearch; protected SearchBuilder<VMInstanceVO> HostAndStateSearch; protected SearchBuilder<VMInstanceVO> StartingWithNoHostSearch; + protected SearchBuilder<VMInstanceVO> NotMigratingSearch; @Inject ResourceTagDao _tagsDao; @@ -280,6 +281,11 @@ protected void init() { DistinctHostNameSearch.join("nicSearch", nicSearch, DistinctHostNameSearch.entity().getId(), nicSearch.entity().getInstanceId(), JoinBuilder.JoinType.INNER); DistinctHostNameSearch.done(); + NotMigratingSearch = createSearchBuilder(); + NotMigratingSearch.and("host", NotMigratingSearch.entity().getHostId(), Op.EQ); + NotMigratingSearch.and("lastHost", NotMigratingSearch.entity().getLastHostId(), Op.EQ); + NotMigratingSearch.and("state", NotMigratingSearch.entity().getState(), Op.NEQ); + NotMigratingSearch.done(); } @Override @@ -304,6 +310,15 @@ protected void init() { return listBy(sc); } + @Override + public List<VMInstanceVO> listNonMigratingVmsByHostEqualsLastHost(long hostId) { + SearchCriteria<VMInstanceVO> sc = NotMigratingSearch.create(); + sc.setParameters("host", hostId); + sc.setParameters("lastHost", hostId); + sc.setParameters("state", State.Migrating); + return listBy(sc); + } + @Override public List<VMInstanceVO> listByZoneId(long zoneId) { SearchCriteria<VMInstanceVO> sc = AllFieldsSearch.create(); diff --git a/server/src/com/cloud/resource/ResourceManagerImpl.java b/server/src/com/cloud/resource/ResourceManagerImpl.java index 2966d41d8bf..ba3c157cf9f 100755 --- a/server/src/com/cloud/resource/ResourceManagerImpl.java +++ b/server/src/com/cloud/resource/ResourceManagerImpl.java @@ -30,6 +30,7 @@ import javax.inject.Inject; import javax.naming.ConfigurationException; +import com.cloud.vm.dao.UserVmDetailsDao; import org.apache.commons.lang.ObjectUtils; import org.apache.log4j.Logger; import org.springframework.stereotype.Component; @@ -54,6 +55,8 @@ import com.cloud.agent.AgentManager; import com.cloud.agent.api.Answer; import com.cloud.agent.api.Command; +import com.cloud.agent.api.GetVncPortCommand; +import com.cloud.agent.api.GetVncPortAnswer; import com.cloud.agent.api.GetGPUStatsAnswer; import com.cloud.agent.api.GetGPUStatsCommand; import com.cloud.agent.api.GetHostStatsAnswer; @@ -252,6 +255,8 @@ public void setDiscoverers(final List<? extends Discoverer> discoverers) { private ConfigurationManager _configMgr; @Inject private ClusterVSMMapDao _clusterVSMMapDao; + @Inject + private UserVmDetailsDao userVmDetailsDao; private final long _nodeId = ManagementServerNode.getManagementServerId(); @@ -1287,6 +1292,68 @@ public Host maintain(final PrepareForMaintenanceCmd cmd) { } } + /** + * Add VNC details as user VM details for each VM in 'vms' (KVM hosts only) + */ + protected void setKVMVncAccess(long hostId, List<VMInstanceVO> vms) { + for (VMInstanceVO vm : vms) { + GetVncPortAnswer vmVncPortAnswer = (GetVncPortAnswer) _agentMgr.easySend(hostId, new GetVncPortCommand(vm.getId(), vm.getInstanceName())); + if (vmVncPortAnswer != null) { + userVmDetailsDao.addDetail(vm.getId(), "kvm.vnc.address", vmVncPortAnswer.getAddress(), true); + userVmDetailsDao.addDetail(vm.getId(), "kvm.vnc.port", String.valueOf(vmVncPortAnswer.getPort()), true); + } + } + } + + /** + * Configure VNC access for host VMs which have failed migrating to another host while trying to enter Maintenance mode + */ + protected void configureVncAccessForKVMHostFailedMigrations(HostVO host, List<VMInstanceVO> failedMigrations) { + if (host.getHypervisorType().equals(HypervisorType.KVM)) { + _agentMgr.pullAgentOutMaintenance(host.getId()); + setKVMVncAccess(host.getId(), failedMigrations); + _agentMgr.pullAgentToMaintenance(host.getId()); + } + } + + /** + * Set host into ErrorInMaintenance state, as errors occurred during VM migrations. Do the following: + * - Cancel scheduled migrations for those which have already failed + * - Configure VNC access for VMs (KVM hosts only) + */ + protected boolean setHostIntoErrorInMaintenance(HostVO host, List<VMInstanceVO> failedMigrations) throws NoTransitionException { + s_logger.debug("Unable to migrate " + failedMigrations.size() + " VM(s) from host " + host.getUuid()); + _haMgr.cancelScheduledMigrations(host); + configureVncAccessForKVMHostFailedMigrations(host, failedMigrations); + resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId); + return false; + } + + /** + * Safely transit host into Maintenance mode + */ + protected boolean setHostIntoMaintenance(HostVO host) throws NoTransitionException { + s_logger.debug("Host " + host.getUuid() + " entering in Maintenance"); + resourceStateTransitTo(host, ResourceState.Event.InternalEnterMaintenance, _nodeId); + ActionEventUtils.onCompletedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), + EventVO.LEVEL_INFO, EventTypes.EVENT_MAINTENANCE_PREPARE, + "completed maintenance for host " + host.getId(), 0); + return true; + } + + /** + * Return true if host goes into Maintenance mode, only when: + * - No Running, Migrating or Failed migrations (host_id = last_host_id) for the host + */ + protected boolean isHostInMaintenance(HostVO host, List<VMInstanceVO> runningVms, List<VMInstanceVO> migratingVms, List<VMInstanceVO> failedMigrations) throws NoTransitionException { + if (CollectionUtils.isEmpty(runningVms) && CollectionUtils.isEmpty(migratingVms)) { + return CollectionUtils.isEmpty(failedMigrations) ? + setHostIntoMaintenance(host) : + setHostIntoErrorInMaintenance(host, failedMigrations); + } + return false; + } + @Override public boolean checkAndMaintain(final long hostId) { boolean hostInMaintenance = false; @@ -1296,11 +1363,9 @@ public boolean checkAndMaintain(final long hostId) { if (host.getType() != Host.Type.Storage) { final List<VMInstanceVO> vos = _vmDao.listByHostId(hostId); final List<VMInstanceVO> vosMigrating = _vmDao.listVmsMigratingFromHost(hostId); - if (vos.isEmpty() && vosMigrating.isEmpty()) { - resourceStateTransitTo(host, ResourceState.Event.InternalEnterMaintenance, _nodeId); - hostInMaintenance = true; - ActionEventUtils.onCompletedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventVO.LEVEL_INFO, EventTypes.EVENT_MAINTENANCE_PREPARE, "completed maintenance for host " + hostId, 0); - } + final List<VMInstanceVO> failedVmMigrations = _vmDao.listNonMigratingVmsByHostEqualsLastHost(hostId); + + hostInMaintenance = isHostInMaintenance(host, vos, vosMigrating, failedVmMigrations); } } catch (final NoTransitionException e) { s_logger.debug("Cannot transmit host " + host.getId() + "to Maintenance state", e); diff --git a/server/src/com/cloud/servlet/ConsoleProxyServlet.java b/server/src/com/cloud/servlet/ConsoleProxyServlet.java index cc788c7b118..8cfaa9fd69b 100644 --- a/server/src/com/cloud/servlet/ConsoleProxyServlet.java +++ b/server/src/com/cloud/servlet/ConsoleProxyServlet.java @@ -35,6 +35,7 @@ import javax.servlet.http.HttpServletResponse; import javax.servlet.http.HttpSession; +import com.cloud.resource.ResourceState; import org.apache.commons.codec.binary.Base64; import org.apache.log4j.Logger; import org.springframework.stereotype.Component; @@ -418,7 +419,14 @@ private String composeConsoleAccessUrl(String rootUrl, VirtualMachine vm, HostVO StringBuffer sb = new StringBuffer(rootUrl); String host = hostVo.getPrivateIpAddress(); - Pair<String, Integer> portInfo = _ms.getVncPort(vm); + Pair<String, Integer> portInfo; + if (hostVo.getResourceState().equals(ResourceState.ErrorInMaintenance)) { + UserVmDetailVO detailAddress = _userVmDetailsDao.findDetail(vm.getId(), "kvm.vnc.address"); + UserVmDetailVO detailPort = _userVmDetailsDao.findDetail(vm.getId(), "kvm.vnc.port"); + portInfo = new Pair<>(detailAddress.getValue(), Integer.valueOf(detailPort.getValue())); + } else { + portInfo = _ms.getVncPort(vm); + } if (s_logger.isDebugEnabled()) s_logger.debug("Port info " + portInfo.first()); diff --git a/server/test/com/cloud/resource/ResourceManagerImplTest.java b/server/test/com/cloud/resource/ResourceManagerImplTest.java new file mode 100644 index 00000000000..525ccd0c071 --- /dev/null +++ b/server/test/com/cloud/resource/ResourceManagerImplTest.java @@ -0,0 +1,191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package com.cloud.resource; + +import com.cloud.agent.AgentManager; +import com.cloud.agent.api.GetVncPortAnswer; +import com.cloud.agent.api.GetVncPortCommand; +import com.cloud.capacity.dao.CapacityDao; +import com.cloud.event.ActionEventUtils; +import com.cloud.ha.HighAvailabilityManager; +import com.cloud.host.Host; +import com.cloud.host.HostVO; +import com.cloud.host.dao.HostDao; +import com.cloud.hypervisor.Hypervisor; +import com.cloud.storage.StorageManager; +import com.cloud.utils.fsm.NoTransitionException; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.dao.UserVmDetailsDao; +import com.cloud.vm.dao.VMInstanceDao; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.BDDMockito; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.mockito.Spy; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.junit4.PowerMockRunner; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static com.cloud.resource.ResourceState.Event.InternalEnterMaintenance; +import static com.cloud.resource.ResourceState.Event.UnableToMigrate; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyBoolean; +import static org.mockito.Matchers.anyLong; +import static org.mockito.Matchers.anyString; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +@RunWith(PowerMockRunner.class) +@PrepareForTest({ActionEventUtils.class, ResourceManagerImpl.class}) +public class ResourceManagerImplTest { + + @Mock + private CapacityDao capacityDao; + @Mock + private StorageManager storageManager; + @Mock + private HighAvailabilityManager haManager; + @Mock + private UserVmDetailsDao userVmDetailsDao; + @Mock + private AgentManager agentManager; + @Mock + private HostDao hostDao; + @Mock + private VMInstanceDao vmInstanceDao; + + @Spy + @InjectMocks + private ResourceManagerImpl resourceManager = new ResourceManagerImpl(); + + @Mock + private HostVO host; + @Mock + private VMInstanceVO vm1; + @Mock + private VMInstanceVO vm2; + + @Mock + private GetVncPortAnswer getVncPortAnswerVm1; + @Mock + private GetVncPortAnswer getVncPortAnswerVm2; + @Mock + private GetVncPortCommand getVncPortCommandVm1; + @Mock + private GetVncPortCommand getVncPortCommandVm2; + + private static long hostId = 1L; + + private static long vm1Id = 1L; + private static String vm1InstanceName = "i-1-VM"; + private static long vm2Id = 2L; + private static String vm2InstanceName = "i-2-VM"; + + private static String vm1VncAddress = "10.2.2.2"; + private static int vm1VncPort = 5900; + private static String vm2VncAddress = "10.2.2.2"; + private static int vm2VncPort = 5901; + + @Before + public void setup() throws Exception { + MockitoAnnotations.initMocks(this); + when(host.getType()).thenReturn(Host.Type.Routing); + when(host.getId()).thenReturn(hostId); + when(host.getResourceState()).thenReturn(ResourceState.Enabled); + when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.VMware); + when(hostDao.findById(hostId)).thenReturn(host); + when(vm1.getId()).thenReturn(vm1Id); + when(vm2.getId()).thenReturn(vm2Id); + when(vm1.getInstanceName()).thenReturn(vm1InstanceName); + when(vm2.getInstanceName()).thenReturn(vm2InstanceName); + when(vmInstanceDao.listByHostId(hostId)).thenReturn(new ArrayList<>()); + when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(new ArrayList<>()); + when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(hostId)).thenReturn(new ArrayList<>()); + PowerMockito.mockStatic(ActionEventUtils.class); + BDDMockito.given(ActionEventUtils.onCompletedActionEvent(anyLong(), anyLong(), anyString(), anyString(), anyString(), anyLong())) + .willReturn(1L); + when(getVncPortAnswerVm1.getAddress()).thenReturn(vm1VncAddress); + when(getVncPortAnswerVm1.getPort()).thenReturn(vm1VncPort); + when(getVncPortAnswerVm2.getAddress()).thenReturn(vm2VncAddress); + when(getVncPortAnswerVm2.getPort()).thenReturn(vm2VncPort); + PowerMockito.whenNew(GetVncPortCommand.class).withArguments(vm1Id, vm1InstanceName).thenReturn(getVncPortCommandVm1); + PowerMockito.whenNew(GetVncPortCommand.class).withArguments(vm2Id, vm2InstanceName).thenReturn(getVncPortCommandVm2); + when(agentManager.easySend(eq(hostId), eq(getVncPortCommandVm1))).thenReturn(getVncPortAnswerVm1); + when(agentManager.easySend(eq(hostId), eq(getVncPortCommandVm2))).thenReturn(getVncPortAnswerVm2); + } + + @Test + public void testCheckAndMaintainEnterMaintenanceMode() throws NoTransitionException { + boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); + verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), new ArrayList<>(), new ArrayList<>()); + verify(resourceManager).setHostIntoMaintenance(host); + verify(resourceManager).resourceStateTransitTo(eq(host), eq(InternalEnterMaintenance), anyLong()); + Assert.assertTrue(enterMaintenanceMode); + } + + @Test + public void testCheckAndMaintainErrorInMaintenanceRunningVms() throws NoTransitionException { + when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2)); + boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); + verify(resourceManager).isHostInMaintenance(host, Arrays.asList(vm1, vm2), new ArrayList<>(), new ArrayList<>()); + Assert.assertFalse(enterMaintenanceMode); + } + + @Test + public void testCheckAndMaintainErrorInMaintenanceMigratingVms() throws NoTransitionException { + when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm1, vm2)); + boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); + verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), Arrays.asList(vm1, vm2), new ArrayList<>()); + Assert.assertFalse(enterMaintenanceMode); + } + + @Test + public void testCheckAndMaintainErrorInMaintenanceFailedMigrations() throws NoTransitionException { + when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(hostId)).thenReturn(Arrays.asList(vm1, vm2)); + boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); + verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), new ArrayList<>(), Arrays.asList(vm1, vm2)); + verify(resourceManager).setHostIntoErrorInMaintenance(host, Arrays.asList(vm1, vm2)); + verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMigrate), anyLong()); + Assert.assertFalse(enterMaintenanceMode); + } + + @Test + public void testConfigureVncAccessForKVMHostFailedMigrations() { + when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.KVM); + List<VMInstanceVO> vms = Arrays.asList(vm1, vm2); + resourceManager.configureVncAccessForKVMHostFailedMigrations(host, vms); + verify(agentManager).pullAgentOutMaintenance(hostId); + verify(resourceManager).setKVMVncAccess(hostId, vms); + verify(agentManager, times(vms.size())).easySend(eq(hostId), any(GetVncPortCommand.class)); + verify(userVmDetailsDao).addDetail(eq(vm1Id), eq("kvm.vnc.address"), eq(vm1VncAddress), anyBoolean()); + verify(userVmDetailsDao).addDetail(eq(vm1Id), eq("kvm.vnc.port"), eq(String.valueOf(vm1VncPort)), anyBoolean()); + verify(userVmDetailsDao).addDetail(eq(vm2Id), eq("kvm.vnc.address"), eq(vm2VncAddress), anyBoolean()); + verify(userVmDetailsDao).addDetail(eq(vm2Id), eq("kvm.vnc.port"), eq(String.valueOf(vm2VncPort)), anyBoolean()); + verify(agentManager).pullAgentToMaintenance(hostId); + } +} ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Prevent hosts fall into Maintenance when there are running VMs on it > -------------------------------------------------------------------- > > Key: CLOUDSTACK-10326 > URL: https://issues.apache.org/jira/browse/CLOUDSTACK-10326 > Project: CloudStack > Issue Type: Bug > Security Level: Public(Anyone can view this level - this is the > default.) > Affects Versions: 4.11.0.0 > Reporter: Nicolas Vazquez > Assignee: Nicolas Vazquez > Priority: Major > Fix For: 4.11.1.0 > > Attachments: CLOUDSTACK-10326-Debug.png, > CLOUDSTACK-10326-InitialState.png, CLOUDSTACK-10326-Migrating.png, > CLOUDSTACK-10326-MigrationFailed.png > > > This issue was discovered, fixed and tested on KVM, but applies for every > hypervisor. > h2. Background > When enabling maintenance mode in a host, host state is put into > 'PrepareForMaintenance' and running VMs are migrated into another host. After > every VM is migrated, host goes to 'Maintenance' state. > Checks are performed on ResourceManagerImpl.checkAndMaintan() method: > * List VMs with host_id = HOST_ID > * List VMs with last_host_id = HOST_ID and state=Migrating > When both queries are empty, then the host can be put into Maintenance. > When a VM is being migrated to DEST_HOST, its host_id column is set to > DEST_HOST, last_host_id = ORIGIN_HOST and state = Migrating. If then > migration fails, host_id = last_host_id = ORIGIN_HOST > h2. Issue > This sequence: > * Enable maintenance mode on ORIGIN_HOST > * VMs start being migrated to a host, say DEST_HOST > * checkAndMaintain() starts: > ** First check passes (no VM with host_id = ORIGIN_HOST_ID as those are > being migrated) > ** Before the second check, one or more migrations fail > ** Second check passes, however there are VMs running on the host as > migrations have failed. > * Host goes into Maintenance state. > Screenshots attached, query executed on each case: > select id, name, instance_name, state, host_id, last_host_id from vm_instance; -- This message was sent by Atlassian JIRA (v7.6.3#76005)