From e37856627ec23d94f0b8718189b3e7b6c1f521f1 Mon Sep 17 00:00:00 2001 From: Abhinandan Prateek Date: Fri, 25 Mar 2016 14:19:25 +0530 Subject: [PATCH] CLOUDSTACK-9323: Cancelling maintenance when prepare for maintenance is not yet complete throws an exception as VMs are still in migration. There is no point in throwing an exception; instead, cancel all the pending migration tasks and then cancel maintenance. The current migration will complete as normal migration jobs. --- .../cloud/resource/ResourceManagerImpl.java | 10 +- .../component/test_host_maintenance.py | 325 ++++++++++++++++++ tools/marvin/marvin/lib/utils.py | 6 +- 3 files changed, 334 insertions(+), 7 deletions(-) create mode 100644 test/integration/component/test_host_maintenance.py diff --git a/server/src/com/cloud/resource/ResourceManagerImpl.java b/server/src/com/cloud/resource/ResourceManagerImpl.java index b0b18490cab..7fb6d1c3956 100755 --- a/server/src/com/cloud/resource/ResourceManagerImpl.java +++ b/server/src/com/cloud/resource/ResourceManagerImpl.java @@ -2107,11 +2107,13 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, /* TODO: move to listener */ _haMgr.cancelScheduledMigrations(host); + + boolean vms_migrating=false; List vms = _haMgr.findTakenMigrationWork(); for (VMInstanceVO vm : vms) { - if (vm != null && vm.getHostId() != null && vm.getHostId() == hostId) { - s_logger.info("Unable to cancel migration because the vm is being migrated: " + vm); - return false; + if (vm.getHostId() != null && vm.getHostId() == hostId) { + s_logger.warn("Cancel host maintenance: Migrations scheduled for " + vm + ", hostId = " + hostId); + vms_migrating=true; } } @@ -2120,7 +2122,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, _agentMgr.pullAgentOutMaintenance(hostId); // for kvm, need to log into kvm host, restart cloudstack-agent - if (host.getHypervisorType() == HypervisorType.KVM || host.getHypervisorType() == HypervisorType.LXC) { + if 
((host.getHypervisorType() == HypervisorType.KVM && ! vms_migrating) || host.getHypervisorType() == HypervisorType.LXC) { boolean sshToAgent = Boolean.parseBoolean(_configDao.getValue(Config.KvmSshToAgentEnabled.key())); if (!sshToAgent) { diff --git a/test/integration/component/test_host_maintenance.py b/test/integration/component/test_host_maintenance.py new file mode 100644 index 00000000000..55fe1d05e1b --- /dev/null +++ b/test/integration/component/test_host_maintenance.py @@ -0,0 +1,325 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" BVT tests for Hosts Maintenance +""" + +# Import Local Modules +from marvin.codes import FAILED +from marvin.cloudstackTestCase import * +from marvin.cloudstackAPI import * +from marvin.lib.utils import * +from marvin.lib.base import * +from marvin.lib.common import * +from nose.plugins.attrib import attr + +from time import sleep + +_multiprocess_shared_ = False + + +class TestHostMaintenance(cloudstackTestCase): + + def setUp(self): + self.logger = logging.getLogger('TestHM') + self.stream_handler = logging.StreamHandler() + self.logger.setLevel(logging.DEBUG) + self.logger.addHandler(self.stream_handler) + self.apiclient = self.testClient.getApiClient() + self.hypervisor = self.testClient.getHypervisorInfo() + self.dbclient = self.testClient.getDbConnection() + self.services = self.testClient.getParsedTestDataConfig() + self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests()) + self.pod = get_pod(self.apiclient, self.zone.id) + self.cleanup = [] + self.services = { + "service_offering": { + "name": "Ultra Tiny Instance", + "displaytext": "Ultra Tiny Instance", + "cpunumber": 1, + "cpuspeed": 100, + "memory": 128, + }, + "vm": { + "username": "root", + "password": "password", + "ssh_port": 22, + # Hypervisor type should be same as + # hypervisor type of cluster + "privateport": 22, + "publicport": 22, + "protocol": 'TCP', + }, + "natrule": { + "privateport": 22, + "publicport": 22, + "startport": 22, + "endport": 22, + "protocol": "TCP", + "cidrlist": '0.0.0.0/0', + }, + "ostype": 'CentOS 5.3 (64-bit)', + "sleep": 60, + "timeout": 10, + } + + + def tearDown(self): + try: + # Clean up, terminate the created templates + cleanup_resources(self.apiclient, self.cleanup) + + except Exception as e: + raise Exception("Warning: Exception during cleanup : %s" % e) + + return + + def createVMs(self, hostId, number): + + self.template = get_template( + self.apiclient, + self.zone.id, + self.services["ostype"] + ) + + if self.template == FAILED: + assert 
False, "get_template() failed to return template with description %s" % self.services["ostype"] + + self.logger.debug("Using template %s " % self.template.id) + + self.service_offering = ServiceOffering.create( + self.apiclient, + self.services["service_offering"] + ) + self.logger.debug("Using service offering %s " % self.service_offering.id) + + vms=[] + for i in range(0, number): + self.services["vm"]["zoneid"] = self.zone.id + self.services["vm"]["template"] = self.template.id + self.services["vm"]["displayname"] = 'vm' + str(i) + self.services["vm"]["hypervisor"] = self.hypervisor + vm = VirtualMachine.create( + self.apiclient, + self.services["vm"], + serviceofferingid=self.service_offering.id, + hostid=hostId + ) + vms.append(vm) + self.cleanup.append(vm) + self.logger.debug("VM create = {}".format(vm.id)) + return vms + + def checkVmMigratingOnHost(self, hostId): + vm_migrating=False + listVms1 = VirtualMachine.list( + self.apiclient, + hostid=hostId + ) + + if (listVms1 is not None): + self.logger.debug('Vms found = {} '.format(len(listVms1))) + for vm in listVms1: + if (vm.state == "Migrating"): + self.logger.debug('VirtualMachine on Hyp id = {} is in {}'.format(vm.id, vm.state)) + vm_migrating=True + break + + return (vm_migrating,) + + def NoOfVMsOnHost(self, hostId): + listVms = VirtualMachine.list( + self.apiclient, + hostid=hostId + ) + no_of_vms=0 + if (listVms is not None): + for vm in listVms: + self.logger.debug('VirtualMachine on Hyp 1 = {}'.format(vm.id)) + no_of_vms=no_of_vms+1 + + return no_of_vms + + @attr( + tags=[ + "advanced", + "advancedns", + "smoke", + "basic", + "eip", + "sg"], + required_hardware="true") + def test_01_cancel_host_maintenace_with_no_migration_jobs(self): + listHost = Host.list( + self.apiclient, + type='Routing', + zoneid=self.zone.id, + podid=self.pod.id, + ) + for host in listHost: + self.logger.debug('1 Hypervisor = {}'.format(host.id)) + + + if (len(listHost) < 2): + raise unittest.SkipTest("Cancel host 
maintenance when VMs are migrating should be tested for 2 or more hosts"); + return + + vm_migrating=False + + try: + + target_host_id = listHost[0].id + other_host_id = listHost[1].id + + cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd() + cmd.id = target_host_id + response = self.apiclient.prepareHostForMaintenance(cmd) + + self.logger.debug('Host with id {} is in prepareHostForMaintenance'.format(target_host_id)) + + #as soon as VM is picked for migration its last hostid is updated to the new host + # that is why VM shows up as migrating on the other host + vm_migrating = wait_until(1, 10, self.checkVmMigratingOnHost, other_host_id) + + cmd = cancelHostMaintenance.cancelHostMaintenanceCmd() + cmd.id = target_host_id + response = self.apiclient.cancelHostMaintenance(cmd) + + self.logger.debug('Host with id {} is in cancelHostMaintenance'.format(target_host_id) ) + + + target_host_id = listHost[1].id + other_host_id = listHost[0].id + + cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd() + cmd.id = target_host_id + response = self.apiclient.prepareHostForMaintenance(cmd) + + self.logger.debug('Host with id {} is in prepareHostForMaintenance'.format(target_host_id)) + + vm_migrating = wait_until(1, 10, self.checkVmMigratingOnHost, other_host_id) + + cmd = cancelHostMaintenance.cancelHostMaintenanceCmd() + cmd.id = target_host_id + response = self.apiclient.cancelHostMaintenance(cmd) + + self.logger.debug('Host with id {} is in cancelHostMaintenance'.format(target_host_id) ) + + + except Exception as e: + self.logger.debug("Exception {}".format(e)) + self.fail("Cancel host maintenance failed {}".format(e[0])) + + + if (vm_migrating == True): + raise unittest.SkipTest("VMs are migrating and the test will not be able to check the conditions the test is intended for"); + + + return + + + + + @attr( + tags=[ + "advanced", + "advancedns", + "smoke", + "basic", + "eip", + "sg"], + required_hardware="true") + def 
test_02_cancel_host_maintenace_with_migration_jobs(self): + + listHost = Host.list( + self.apiclient, + type='Routing', + zoneid=self.zone.id, + podid=self.pod.id, + ) + for host in listHost: + self.logger.debug('2 Hypervisor = {}'.format(host.id)) + + if (len(listHost) != 2): + raise unittest.SkipTest("Cancel host maintenance when VMs are migrating can only be tested with 2 hosts"); + return + + + no_of_vms = self.NoOfVMsOnHost(listHost[0].id) + + no_of_vms = no_of_vms + self.NoOfVMsOnHost(listHost[0].id) + + if no_of_vms < 5: + self.logger.debug("Create VMs as there are not enough vms to check host maintenance") + no_vm_req = 5 - no_of_vms + if (no_vm_req > 0): + self.logger.debug("Creating vms = {}".format(no_vm_req)) + self.vmlist = self.createVMs(listHost[0].id, no_vm_req) + + vm_migrating=False + + try: + + target_host_id = listHost[0].id + other_host_id = listHost[1].id + + cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd() + cmd.id = target_host_id + response = self.apiclient.prepareHostForMaintenance(cmd) + + self.logger.debug('Host with id {} is in prepareHostForMaintenance'.format(target_host_id)) + + #as soon as VM is picked for migration its last hostid is updated to the new host + # that is why VM shows up as migrating on the other host + vm_migrating = wait_until(1, 10, self.checkVmMigratingOnHost, other_host_id) + + cmd = cancelHostMaintenance.cancelHostMaintenanceCmd() + cmd.id = target_host_id + response = self.apiclient.cancelHostMaintenance(cmd) + + self.logger.debug('Host with id {} is in cancelHostMaintenance'.format(target_host_id) ) + + + target_host_id = listHost[1].id + other_host_id = listHost[0].id + + cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd() + cmd.id = target_host_id + response = self.apiclient.prepareHostForMaintenance(cmd) + + self.logger.debug('Host with id {} is in prepareHostForMaintenance'.format(target_host_id)) + + vm_migrating = wait_until(1, 10, self.checkVmMigratingOnHost, other_host_id) + + 
cmd = cancelHostMaintenance.cancelHostMaintenanceCmd() + cmd.id = target_host_id + response = self.apiclient.cancelHostMaintenance(cmd) + + self.logger.debug('Host with id {} is in cancelHostMaintenance'.format(target_host_id) ) + + + except Exception as e: + self.logger.debug("Exception {}".format(e)) + self.fail("Cancel host maintenance failed {}".format(e[0])) + + + if (vm_migrating == False): + raise unittest.SkipTest("No VM is migrating and the test will not be able to check the conditions the test is intended for"); + + + return + + diff --git a/tools/marvin/marvin/lib/utils.py b/tools/marvin/marvin/lib/utils.py index 7f1522b7d26..8f14333251c 100644 --- a/tools/marvin/marvin/lib/utils.py +++ b/tools/marvin/marvin/lib/utils.py @@ -506,15 +506,15 @@ def verifyRouterState(apiclient, routerid, allowedstates): (allowedstates, routers[0].redundantstate)] return [PASS, None] - + def wait_until(retry_interval=2, no_of_times=2, callback=None, *callback_args): """ Utility method to try out the callback method at most no_of_times with a interval of retry_interval, Will return immediately if callback returns True. The callback method should be written to return a list of values first being a boolean """ if callback is None: raise ("Bad value for callback method !") - - wait_result = False + + wait_result = False for i in range(0,no_of_times): time.sleep(retry_interval) wait_result, return_val = callback(*callback_args)