mirror of https://github.com/apache/cloudstack.git
536 lines
20 KiB
Python
536 lines
20 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
|
|
from marvin.cloudstackTestCase import *
|
|
from marvin.lib.utils import *
|
|
from marvin.lib.base import *
|
|
from marvin.lib.common import *
|
|
from nose.plugins.attrib import attr
|
|
|
|
from ipmisim.ipmisim import IpmiServerContext, IpmiServer, ThreadedIpmiServer
|
|
|
|
import random
|
|
import socket
|
|
import thread
|
|
|
|
|
|
class TestHaKVMAgent(cloudstackTestCase):
    """ Test cases for KVM host HA (high availability).

    Each test configures out-of-band management for a host against a local
    ipmisim server, enables the 'kvmhaprovider' HA provider on that host and
    then stops, kills or restarts the cloudstack-agent over SSH while polling
    the host's reported HA state.
    """

    # Seconds to pause between two polls of the host state. The original
    # loops polled back-to-back without any pause.
    # NOTE(review): short-lived states (e.g. Suspect) could in principle be
    # missed with a long pause -- keep this small.
    POLL_INTERVAL = 1

    def setUp(self):
        """Collect clients/config, reset HA rows, pick an ipmisim port and
        create the service offering and template used by the tests."""
        testClient = super(TestHaKVMAgent, self).getClsTestClient()

        self.apiClient = testClient.getApiClient()
        self.dbclient = testClient.getDbConnection()
        self.services = testClient.getParsedTestDataConfig()

        self.zone = get_zone(self.apiClient, testClient.getZoneForTests())
        self.host = self.getHost()
        self.cluster_id = self.host.clusterid
        self.server = None

        self.hypervisor = self.testClient.getHypervisorInfo()
        self.mgtSvrDetails = self.config.__dict__["mgtSvr"][0].__dict__
        self.hostConfig = self.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__
        self.fakeMsId = random.randint(10000, 99999) * random.randint(10, 20)

        # Cleanup any existing configs
        self.dbclient.execute("delete from ha_config where resource_type='Host'")

        # Use a random port for ipmisim: bind to port 0 so the OS picks a
        # free one, remember it, and release the probe socket again.
        # The port is released before ipmisim binds it, so another process
        # could still grab it in between.
        probe = socket.socket()
        try:
            probe.bind(('', 0))
            self.serverPort = probe.getsockname()[1]
        finally:
            probe.close()

        # Set Cluster-level setting in order to run tests faster
        cluster_settings = (
            ("kvm.ha.activity.check.failure.ratio", "0.7"),
            ("kvm.ha.activity.check.interval", "10"),
            ("kvm.ha.activity.check.max.attempts", "5"),
            ("kvm.ha.activity.check.timeout", "60"),
            ("kvm.ha.degraded.max.period", "30"),
            ("kvm.ha.fence.timeout", "60"),
            ("kvm.ha.health.check.timeout", "10"),
            ("kvm.ha.recover.failure.threshold", "1"),
            ("kvm.ha.recover.timeout", "120"),
            ("kvm.ha.recover.wait.period", "60"),
        )
        for name, value in cluster_settings:
            self.update_configuration(name, value)

        self.service_offering = ServiceOffering.create(
            self.apiClient,
            self.services["service_offerings"]
        )

        self.template = get_template(
            self.apiClient,
            self.zone.id,
            self.services["ostype"]
        )

        self.cleanup = [self.service_offering]

    def tearDown(self):
        """Remove the DB rows and API resources created by the test and stop
        the ipmisim server.

        Any failure is re-raised as a generic Exception with a
        "Warning: Exception during cleanup" prefix.
        """
        try:
            try:
                run_id = self.getFakeMsRunId()
                self.dbclient.execute("delete from mshost_peer where peer_runid=%s" % run_id)
                self.dbclient.execute("delete from mshost where runid=%s" % run_id)
                self.dbclient.execute("delete from cluster_details where name='resourceHAEnabled'")
                self.dbclient.execute("delete from data_center_details where name='resourceHAEnabled'")
                self.dbclient.execute("delete from ha_config where resource_type='Host'")
                self.dbclient.execute("delete from oobm where port=%d" % self.getIpmiServerPort())
                self.dbclient.execute("delete from cluster_details where name='outOfBandManagementEnabled'")
                self.dbclient.execute("delete from data_center_details where name='outOfBandManagementEnabled'")
                cleanup_resources(self.apiClient, self.cleanup)
            finally:
                # Always stop ipmisim, even when a statement above failed;
                # otherwise the server socket and thread would be leaked.
                if self.server:
                    self.server.shutdown()
                    self.server.server_close()
        except Exception as e:
            raise Exception("Warning: Exception during cleanup : %s" % e)

    def getFakeMsId(self):
        """Return the random fake management server id chosen in setUp."""
        return self.fakeMsId

    def getFakeMsRunId(self):
        """Return the fake management server run id (fake id * 1000)."""
        return self.fakeMsId * 1000

    def getHostHaConfigCmd(self, provider='kvmhaprovider'):
        """Build a configureHAForHost command for the current host."""
        cmd = configureHAForHost.configureHAForHostCmd()
        cmd.provider = provider
        cmd.hostid = self.host.id
        return cmd

    def getHostHaEnableCmd(self):
        """Build an enableHAForHost command for the current host."""
        cmd = enableHAForHost.enableHAForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def getHost(self, hostId=None):
        """Return the first 'Routing' host of the zone and cache it in
        self.host.

        :param hostId: optional host id passed to the list call as a filter
        Skips the test when the list call yields no host.
        """
        response = list_hosts(
            self.apiClient,
            zoneid=self.zone.id,
            type='Routing',
            id=hostId
        )
        # Truthiness also covers a None response, on which len() would raise.
        if response:
            self.host = response[0]
            return self.host
        self.skipTest("No hosts found, skipping out-of-band management test")

    def getIpmiServerIp(self):
        """Return the local address the OS selects for reaching the
        management server (that is where ipmisim will be reachable)."""
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            # Connecting a UDP socket only selects the local address used
            # for that destination; getsockname() then reports it.
            s.connect((self.mgtSvrDetails["mgtSvrIp"], self.mgtSvrDetails["port"]))
            return s.getsockname()[0]
        finally:
            s.close()

    def getIpmiServerPort(self):
        """Return the port picked for ipmisim in setUp."""
        return self.serverPort

    def getOobmConfigCmd(self):
        """Build a configureOutOfBandManagement command that points the
        current host at the local ipmisim server."""
        cmd = configureOutOfBandManagement.configureOutOfBandManagementCmd()
        cmd.driver = 'ipmitool'  # The default available driver
        cmd.address = self.getIpmiServerIp()
        cmd.port = self.getIpmiServerPort()
        cmd.username = 'admin'
        cmd.password = 'password'
        cmd.hostid = self.host.id
        return cmd

    def getOobmEnableCmd(self):
        """Build an enableOutOfBandManagementForHost command."""
        cmd = enableOutOfBandManagementForHost.enableOutOfBandManagementForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def getOobmDisableCmd(self):
        """Build a disableOutOfBandManagementForHost command."""
        cmd = disableOutOfBandManagementForHost.disableOutOfBandManagementForHostCmd()
        cmd.hostid = self.host.id
        return cmd

    def getOobmIssueActionCmd(self):
        """Build an issueOutOfBandManagementPowerAction command; the action
        defaults to 'STATUS'."""
        cmd = issueOutOfBandManagementPowerAction.issueOutOfBandManagementPowerActionCmd()
        cmd.hostid = self.host.id
        cmd.action = 'STATUS'
        return cmd

    def issue_power_action_cmd(self, action, timeout=None):
        """Issue an out-of-band power action for the current host.

        :param action: power action name, e.g. 'ON' or 'STATUS'
        :param timeout: optional timeout set on the command when truthy
        :return: the API response
        Skips the test when the known ipmitool session-id error is hit;
        any other error is re-raised unchanged.
        """
        cmd = self.getOobmIssueActionCmd()
        cmd.action = action
        if timeout:
            cmd.timeout = timeout

        try:
            return self.apiClient.issueOutOfBandManagementPowerAction(cmd)
        except Exception as e:
            if "packet session id 0x0 does not match active session" in str(e):
                self.skipTest("Known ipmitool issue hit, skipping test")
            # Bare raise keeps the original traceback.
            raise

    def configure_and_enable_oobm(self):
        """Configure and enable out-of-band management for the host."""
        self.apiClient.configureOutOfBandManagement(self.getOobmConfigCmd())
        response = self.apiClient.enableOutOfBandManagementForHost(self.getOobmEnableCmd())
        self.assertEqual(response.enabled, True)

    def start_ipmi_server(self):
        """Start an ipmisim server on the chosen port in a background
        thread and remember it in self.server."""
        def startIpmiServer(tname, server):
            self.debug("Starting ipmisim server")
            try:
                server.serve_forever()
            except Exception as e:
                # Best effort: a failing simulator must not kill the thread
                # noisily, but leave a trace instead of swallowing silently.
                self.debug("ipmisim server stopped with error: %s" % e)

        IpmiServerContext('reset')
        ThreadedIpmiServer.allow_reuse_address = False
        server = ThreadedIpmiServer(('0.0.0.0', self.getIpmiServerPort()), IpmiServer)
        thread.start_new_thread(startIpmiServer, ("ipmi-server", server,))
        self.server = server

    def checkSyncToState(self, state, interval):
        """Wait until the host's out-of-band power state equals `state`.

        :param state: expected power state string
        :param interval: sync interval in milliseconds; converted to whole
                         seconds (plus one) for the retry pause
        """
        def checkForStateSync(expectedState):
            response = self.getHost(hostId=self.host.id).outofbandmanagement
            return response.powerstate == expectedState, None

        # Explicit floor division: same result as Python 2's int '/'.
        sync_interval = 1 + int(interval) // 1000
        res, _ = wait_until(sync_interval, 10, checkForStateSync, state)
        if not res:
            self.fail("Failed to get host.powerstate synced to expected state:" + state)
        response = self.getHost(hostId=self.host.id).outofbandmanagement
        self.assertEqual(response.powerstate, state)

    def get_host_in_available_state(self):
        """Set up OOBM + ipmisim, power the host 'ON', enable host HA and
        assert that the host reaches the 'Available' HA state."""
        self.configure_and_start_ipmi_server()
        self.assert_issue_command_state('ON', 'On')
        self.configureAndEnableHostHa()

        self.check_host_transition_to_available()

        response = self.getHost()
        # Value comparison: 'is not' on strings tests object identity and
        # made this diagnostic print fire unconditionally.
        if response.hostha.hastate != "Available":
            print(response)

        self.assertEqual(response.hostha.hastate, "Available")

    def configureAndEnableHostHa(self):
        """Configure the HA provider for the host and enable host HA."""
        self.apiClient.configureHAForHost(self.getHostHaConfigCmd())

        response = self.apiClient.enableHAForHost(self.getHostHaEnableCmd())
        self.assertEqual(response.haenable, True)

    def configure_and_start_ipmi_server(self, power_state=None):
        """
            Setup ipmisim and enable out-of-band management for host

            :param power_state: optional initial power state assigned to the
                                simulated BMC
        """
        self.configure_and_enable_oobm()
        self.start_ipmi_server()
        if power_state:
            bmc = IpmiServerContext().bmc
            bmc.powerstate = power_state

    def assert_issue_command_state(self, command, expected):
        """
            Asserts power action result for a given power command

            The command is issued first (unless it is 'STATUS' itself) and
            the power state reported by a following 'STATUS' is compared
            with `expected`.
        """
        if command != 'STATUS':
            self.issue_power_action_cmd(command)
        response = self.issue_power_action_cmd('STATUS')
        self.assertEqual(response.powerstate, expected)

    def _ssh_execute(self, command):
        """Run `command` on the host over SSH using the configured host
        credentials."""
        client = SshClient(self.host.ipaddress, port=22,
                           user=self.hostConfig["username"],
                           passwd=self.hostConfig["password"])
        return client.execute(command)

    def _ssh_execute_with_retry(self, command, timeout=90):
        """Run `command` over SSH, retrying on any error until `timeout`
        seconds have passed; fails the test when no attempt succeeded."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                self._ssh_execute(command)
                return
            except Exception:
                print("Cannot ssh into: " + self.host.ipaddress)
                time.sleep(self.POLL_INTERVAL)
        self.fail("Could not run '%s' on host %s within %d seconds"
                  % (command, self.host.ipaddress, timeout))

    def _wait_for_host(self, predicate, description, timeout):
        """Poll getHost() until `predicate(host)` is true or `timeout`
        seconds have passed; fails the test on timeout."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            if predicate(self.getHost()):
                return
            time.sleep(self.POLL_INTERVAL)
        self.fail("Host did not reach %s within %d seconds" % (description, timeout))

    def _wait_for_ha_state(self, state, timeout=120):
        """Wait until the host's HA state is exactly `state`."""
        self._wait_for_host(lambda host: host.hostha.hastate == state,
                            "HA state '%s'" % state, timeout)

    def _check_or_recover(self, check, recover):
        """Run `check`; when it raises, run `recover` and re-raise the
        failure as a generic Exception with a warning prefix."""
        try:
            check()
        except Exception as e:
            recover()
            raise Exception("Warning: Exception during test execution : %s" % e)

    def kill_agent(self):
        """Kill the cloudstack-agent processes on the host (plain `kill`),
        retrying the SSH call for up to 90 seconds."""
        self._ssh_execute_with_retry(
            "kill $(ps aux | grep 'cloudstack-agent' | awk '{print $2}')")

    def set_host_to_alert(self):
        """Force the host row's status to 'Alert' directly in the DB."""
        self.dbclient.execute("update host set host.status = 'Alert' where host.uuid = '%s'" % self.host.id)

    def check_host_transitioned_to_degraded(self):
        """Wait up to 120 seconds for the HA state 'Degraded'."""
        self._wait_for_ha_state("Degraded")

    def wait_util_host_is_fencing(self):
        """Wait up to 120 seconds for the HA state 'Fencing'."""
        self._wait_for_ha_state("Fencing")

    def check_host_transitioned_to_suspect(self):
        """Wait up to 120 seconds for the HA state 'Suspect'."""
        self._wait_for_ha_state("Suspect")

    def check_host_transitioned_to_checking(self):
        """Wait up to 120 seconds for the HA state 'Checking'."""
        self._wait_for_ha_state("Checking")

    def wait_util_host_is_fenced(self):
        """Wait up to 120 seconds for the HA state 'Fenced'."""
        self._wait_for_ha_state("Fenced")

    def wait_util_host_is_up(self):
        """Wait up to 120 seconds for the host state 'Up'."""
        self._wait_for_host(lambda host: host.state == "Up",
                            "state 'Up'", 120)

    def stop_agent(self):
        """Stop the cloudstack-agent service on the host (single attempt)."""
        self._ssh_execute("service cloudstack-agent stop")

    def start_agent(self):
        """Restart the agent and wait for the 'Available' HA state."""
        self.ssh_and_restart_agent()
        self.check_host_transition_to_available()

    def ssh_and_restart_agent(self):
        """Restart the cloudstack-agent service on the host, retrying the
        SSH call for up to 90 seconds."""
        self._ssh_execute_with_retry("service cloudstack-agent restart")

    def check_host_transition_to_available(self):
        """Wait up to 90 seconds for the HA state 'Available'."""
        self._wait_for_ha_state("Available", timeout=90)

    def wait_util_host_is_recovered(self):
        """Wait up to 180 seconds for the HA state 'Recovered'."""
        self._wait_for_ha_state("Recovered", timeout=180)

    def reset_host(self):
        """Reboot the host over SSH (single attempt)."""
        self._ssh_execute("reboot")

    def deploy_vm(self):
        """Deploy a VM on the current host and register it for cleanup."""
        vm = VirtualMachine.create(
            self.apiClient,
            services=self.services["virtual_machine"],
            serviceofferingid=self.service_offering.id,
            templateid=self.template.id,
            zoneid=self.zone.id,
            hostid=self.host.id,
            method="POST"
        )

        self.cleanup.append(vm)

    def update_configuration(self, name, value):
        """Set configuration `name` to `value` scoped to the host's
        cluster."""
        update_configuration_cmd = updateConfiguration.updateConfigurationCmd()
        update_configuration_cmd.name = name
        update_configuration_cmd.value = value
        update_configuration_cmd.clusterid = self.cluster_id

        self.apiClient.updateConfiguration(update_configuration_cmd)

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_ha_stop_agent_host_is_degraded(self):
        """
            Tests HA state turns Degraded when agent is stopped
        """
        self.deploy_vm()

        # Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
        self.get_host_in_available_state()

        # SSH into the KVM Host and stop the agent service
        self.stop_agent()

        # Checks if the host would turn into Degraded in the next 120 seconds;
        # on failure the agent is restarted before the error is reported
        self._check_or_recover(self.check_host_transitioned_to_degraded, self.start_agent)

        # Bring the agent back and wait for the host to be Available again
        self.start_agent()

    # NOTE(review): the tag attributes below were already commented out, so
    # this test is not picked up by tag-based selection -- confirm intent.
    #@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_ha_recovering_start_agent_host_is_available(self):
        """
            Tests HA state turns Recovered when agent is stopped and host is reset
        """
        self.deploy_vm()

        # Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
        self.get_host_in_available_state()

        # SSH into the KVM Host and kill the agent processes
        self.kill_agent()

        # Checks if the host would turn into Degraded in the next 120 seconds
        self._check_or_recover(self.check_host_transitioned_to_degraded, self.start_agent)

        # Reset host so a shut down could be emulated. During the bootup host should transition into recovered state
        self.reset_host()

        # Waits until Degraded host turns into Recovered for 180 seconds,
        # if it fails it tries to revert host back to Available
        self._check_or_recover(self.wait_util_host_is_recovered, self.start_agent)

        # SSH into the KVM Host and executes service cloudstack-agent restart of the agent
        self.start_agent()

    # NOTE(review): the tag attributes below were already commented out, so
    # this test is not picked up by tag-based selection -- confirm intent.
    #@attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_ha_fencing_host(self):
        """
            Tests HA state turns Recovered when agent is stopped and host is reset,
            then configure incorrect OOBM configuration, so that Recover command would fail
            and host would transition into Fenced state.
        """
        self.deploy_vm()

        # Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
        self.get_host_in_available_state()

        # SSH into the KVM Host and kill the agent processes
        self.kill_agent()

        # Checks if the host would turn into Degraded in the next 120 seconds
        self._check_or_recover(self.check_host_transitioned_to_degraded, self.start_agent)

        # Change OOBM Configuration to invalid so it would fail the recover operations.
        cmd = self.getOobmConfigCmd()
        cmd.address = "1.1.1.1"
        self.apiClient.configureOutOfBandManagement(cmd)

        # Reset host so a shut down could be emulated. During the bootup host should transition into recovered state
        self.reset_host()
        self.kill_agent()

        # Waits up to 120 seconds until the host turns into Fencing,
        # if it fails it restarts the agent before reporting the error
        self._check_or_recover(self.wait_util_host_is_fencing, self.ssh_and_restart_agent)

        # Configure correct OOBM configuration so that the Fencing operation would succeed
        self.apiClient.configureOutOfBandManagement(self.getOobmConfigCmd())

        # Waits up to 120 seconds until the Fencing host turns into Fenced,
        # if it fails it restarts the agent before reporting the error
        self._check_or_recover(self.wait_util_host_is_fenced, self.ssh_and_restart_agent)

        # SSH into the KVM Host and executes service cloudstack-agent restart of the agent
        self.ssh_and_restart_agent()

        # Waits until state is Up so that cleanup would be successful
        self.wait_util_host_is_up()

    @attr(tags=["devcloud", "advanced", "advancedns", "smoke", "basic", "sg"], required_hardware="true")
    def test_ha_kill_agent_host_is_degraded(self):
        """
            Tests HA state turns Suspect/Checking when some activity/health checks fail
            Configures HA, Logs into to a host and restarts the service
            Then it confirms the ha state jumps through Suspect -> Checking -> Available
        """
        # Configure and Enable OOBM, Set HA Provider and Enable HA. At the end checks if HA State is Available
        self.get_host_in_available_state()

        # SSH into the KVM Host and restart the agent service
        self.ssh_and_restart_agent()

        # Checks if the host would turn into Suspect in the next 120 seconds
        self._check_or_recover(self.check_host_transitioned_to_suspect, self.start_agent)

        # Checks if the host would turn into Checking in the next 120 seconds
        self._check_or_recover(self.check_host_transitioned_to_checking, self.start_agent)

        # Finally the host is expected to return to Available
        self.check_host_transition_to_available()
|