Bug 9446: Investigator reports that a system vm is down even if it isn't....

Changes:
- Added a new Investigator, 'ManagementIPSystemVMInvestigator', that checks whether a VM is alive, but only for system VMs that have a management IP address.
- If no management IP is found, ping test cannot be done, so this investigator would return null in that case.
- The current implementation, 'InvestigatorImpl', is renamed to 'UserVmDomRInvestigator' and performs the ping test for user VMs only.
- Corrected the ping test code that was checking a hard-coded string. Now, if the ping answer is negative, we simply return null ("I don't know") instead of reporting the host as down.
- Added the new investigator to components.xml
This commit is contained in:
prachi 2011-04-28 12:28:22 -07:00
parent 5198709942
commit c2824edc03
4 changed files with 255 additions and 83 deletions

View File

@ -64,8 +64,9 @@
</adapters>
<adapters key="com.cloud.ha.Investigator">
<adapter name="SimpleInvestigator" class="com.cloud.ha.CheckOnAgentInvestigator"/>
<adapter name="PingInvestigator" class="com.cloud.ha.InvestigatorImpl"/>
<adapter name="PingInvestigator" class="com.cloud.ha.UserVmDomRInvestigator"/>
<adapter name="XenServerInvestigator" class="com.cloud.ha.XenServerInvestigator"/>
<adapter name="ManagementIPSysVMInvestigator" class="com.cloud.ha.ManagementIPSystemVMInvestigator"/>
</adapters>
<adapters key="com.cloud.ha.FenceBuilder">
<adapter name="XenServerFenceBuilder" class="com.cloud.ha.XenServerFencer"/>

View File

@ -0,0 +1,108 @@
/**
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
*
* This software is licensed under the GNU General Public License v3 or later.
*
* It is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package com.cloud.ha;
import java.util.List;
import java.util.Map;
import javax.naming.ConfigurationException;
import org.apache.log4j.Logger;
import com.cloud.agent.AgentManager;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.PingTestCommand;
import com.cloud.exception.AgentUnavailableException;
import com.cloud.exception.OperationTimedoutException;
import com.cloud.host.Host.Type;
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.utils.component.Inject;
/**
 * Common base for {@link Investigator} implementations that probe reachability
 * by asking another host to ping an IP address.
 *
 * Provides the lifecycle boilerplate ({@code configure}/{@code getName}/
 * {@code start}/{@code stop}) plus two helpers for subclasses:
 * {@link #findHostByPod(long, Long)} and {@link #testIpAddress(Long, String)}.
 */
public abstract class AbstractInvestigatorImpl implements Investigator {
    private static final Logger s_logger = Logger.getLogger(AbstractInvestigatorImpl.class);

    /** Value passed as the wait argument of the ping test command (30 * 1000). */
    private static final int PING_TEST_TIMEOUT = 30 * 1000;

    private String _name = null;
    @Inject private HostDao _hostDao = null;
    @Inject private AgentManager _agentMgr = null;

    /**
     * Stores the configured name of this investigator.
     *
     * @param name   name assigned to this component
     * @param params configuration parameters (not used here)
     * @return always {@code true}
     */
    @Override
    public boolean configure(String name, Map<String, Object> params) throws ConfigurationException {
        _name = name;
        return true;
    }

    /** @return the name given to {@link #configure(String, Map)} */
    @Override
    public String getName() {
        return _name;
    }

    /** @return always {@code true}; nothing needs to be started */
    @Override
    public boolean start() {
        return true;
    }

    /** @return always {@code true}; nothing needs to be stopped */
    @Override
    public boolean stop() {
        return true;
    }

    /**
     * Lists the ids of hosts in a pod whose Host.status is Up and whose
     * Host.type is Routing, optionally leaving one host out.
     *
     * @param podId         pod to search
     * @param excludeHostId host id to drop from the result, or {@code null} to keep all
     * @return the matching host ids as returned by the host DAO, minus the excluded one
     */
    protected List<Long> findHostByPod(long podId, Long excludeHostId) {
        List<Long> hostIds = _hostDao.listBy(null, podId, null, Type.Routing, Status.Up);
        if (excludeHostId != null) {
            // excludeHostId is a Long, so this resolves to remove(Object), not remove(int index)
            hostIds.remove(excludeHostId);
        }
        return hostIds;
    }

    /**
     * Asks the given host to ping an IP address.
     *
     * A failed ping is NOT reported as {@code Status.Down}: the only definite
     * answer this method gives is {@code Status.Up}; everything else is
     * "I don't know" ({@code null}).
     *
     * @param hostId     id of the host that should execute the ping
     * @param testHostIp IP address to ping
     * @return {@link Status#Up} if the ping succeeded; {@code null} if the answer
     *         was null or negative, the agent was unavailable, or the operation timed out
     */
    protected Status testIpAddress(Long hostId, String testHostIp) {
        try {
            Answer pingTestAnswer = _agentMgr.send(hostId, new PingTestCommand(testHostIp), PING_TEST_TIMEOUT);
            if (pingTestAnswer == null) {
                if (s_logger.isDebugEnabled()) {
                    s_logger.debug("host (" + testHostIp + ") returns null answer");
                }
                return null;
            }

            if (pingTestAnswer.getResult()) {
                if (s_logger.isDebugEnabled()) {
                    s_logger.debug("host (" + testHostIp + ") has been successfully pinged, returning that host is up");
                }
                // the target answered the ping, so it is reachable
                return Status.Up;
            }

            if (s_logger.isDebugEnabled()) {
                s_logger.debug("host (" + testHostIp + ") cannot be pinged, returning null ('I don't know')");
            }
            return null;
        } catch (AgentUnavailableException e) {
            // best effort: the caller can try another host, but leave a trace of why this one failed
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("agent on host (" + hostId + ") is unavailable, cannot ping (" + testHostIp + "): " + e.getMessage());
            }
            return null;
        } catch (OperationTimedoutException e) {
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("ping test of (" + testHostIp + ") from host (" + hostId + ") timed out: " + e.getMessage());
            }
            return null;
        }
    }
}

View File

@ -0,0 +1,129 @@
/**
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
*
* This software is licensed under the GNU General Public License v3 or later.
*
* It is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package com.cloud.ha;
import java.util.List;
import java.util.Map;
import javax.ejb.Local;
import javax.naming.ConfigurationException;
import org.apache.log4j.Logger;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.network.NetworkManager;
import com.cloud.network.Networks.TrafficType;
import com.cloud.utils.component.Inject;
import com.cloud.vm.Nic;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
/**
 * Investigator that checks whether a system VM is alive by having other hosts
 * in the VM's pod ping the VM's private IP address.
 *
 * Only system VMs that have a management-traffic nic are examined; for any
 * other VM the answer is {@code null} ("I don't know").
 *
 * The component lifecycle methods ({@code configure}, {@code getName},
 * {@code start}, {@code stop}) are inherited from {@link AbstractInvestigatorImpl}.
 */
@Local(value={Investigator.class})
public class ManagementIPSystemVMInvestigator extends AbstractInvestigatorImpl {
    private static final Logger s_logger = Logger.getLogger(ManagementIPSystemVMInvestigator.class);

    @Inject private HostDao _hostDao = null;
    @Inject private NetworkManager _networkMgr = null;

    /**
     * Determines whether the given system VM is alive.
     *
     * @param vm   the VM to investigate
     * @param host not used by this investigator
     * @return {@code Boolean.TRUE} if some other host in the pod could ping the VM's
     *         private IP; {@code Boolean.FALSE} if a ping test reported the VM down while
     *         the VM's host could be pinged; {@code null} if the state cannot be determined
     *         (not a system VM, no management nic, or no host gave a usable answer)
     */
    @Override
    public Boolean isVmAlive(VMInstanceVO vm, HostVO host) {
        if (s_logger.isDebugEnabled()) {
            s_logger.debug("testing if vm (" + vm.getId() + ") is alive");
        }

        if (!VirtualMachine.Type.isSystemVM(vm.getType())) {
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("Not a System Vm, unable to determine state of vm (" + vm.getId() + "), returning null");
            }
            return null;
        }

        // The nic is looked up only to verify that a management nic exists;
        // the address that gets pinged below is vm.getPrivateIpAddress().
        Nic nic = _networkMgr.getNicForTraffic(vm.getId(), TrafficType.Management);
        if (nic == null) {
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("Unable to find a management nic, cannot ping this system VM, unable to determine state of vm (" + vm.getId() + "), returning null");
            }
            return null;
        }

        // get the data center IP address, find a host on the pod, use that host to ping the data center IP address
        HostVO vmHost = _hostDao.findById(vm.getHostId());
        List<Long> otherHosts = findHostByPod(vm.getPodId(), vm.getHostId());
        for (Long otherHost : otherHosts) {
            Status vmState = testIpAddress(otherHost, vm.getPrivateIpAddress());
            if (vmState == null) {
                // can't get information from that host, try the next one
                continue;
            }

            if (vmState == Status.Up) {
                if (s_logger.isDebugEnabled()) {
                    s_logger.debug("successfully pinged vm's private IP (" + vm.getPrivateIpAddress() + "), returning that the VM is up");
                }
                return Boolean.TRUE;
            }

            if (vmState == Status.Down) {
                // NOTE(review): testIpAddress as defined in AbstractInvestigatorImpl returns only
                // Status.Up or null, so this branch looks unreachable unless it is overridden -
                // confirm whether a definitive "down" answer is still intended.
                if (vmHost == null) {
                    // without the VM's host record we cannot cross-check, so we don't have enough information
                    if (s_logger.isDebugEnabled()) {
                        s_logger.debug("unable to find the host of vm (" + vm.getId() + "), cannot verify that the VM is down");
                    }
                    continue;
                }

                // We can't ping the VM directly...if we can ping the host, then report the VM down.
                // If we can't ping the host, then we don't have enough information.
                Status vmHostState = testIpAddress(otherHost, vmHost.getPrivateIpAddress());
                if (vmHostState == Status.Up) {
                    if (s_logger.isDebugEnabled()) {
                        s_logger.debug("successfully pinged vm's host IP (" + vmHost.getPrivateIpAddress() + "), but could not ping VM, returning that the VM is down");
                    }
                    return Boolean.FALSE;
                }
            }
        }

        if (s_logger.isDebugEnabled()) {
            s_logger.debug("unable to determine state of vm (" + vm.getId() + "), returning null");
        }
        return null;
    }

    /**
     * This investigator does not examine agents.
     *
     * @param agent ignored
     * @return always {@code null} ("I don't know")
     */
    @Override
    public Status isAgentAlive(HostVO agent) {
        return null;
    }
}

View File

@ -18,6 +18,7 @@
package com.cloud.ha;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@ -31,10 +32,9 @@ import com.cloud.agent.api.Answer;
import com.cloud.agent.api.PingTestCommand;
import com.cloud.exception.AgentUnavailableException;
import com.cloud.exception.OperationTimedoutException;
import com.cloud.host.Host.Type;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.network.NetworkManager;
import com.cloud.network.Networks.TrafficType;
import com.cloud.network.router.VirtualNetworkApplianceManager;
@ -44,16 +44,13 @@ import com.cloud.vm.Nic;
import com.cloud.vm.UserVmVO;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.dao.DomainRouterDao;
import com.cloud.vm.dao.UserVmDao;
@Local(value={Investigator.class})
public class InvestigatorImpl implements Investigator {
private static final Logger s_logger = Logger.getLogger(InvestigatorImpl.class);
public class UserVmDomRInvestigator extends AbstractInvestigatorImpl {
private static final Logger s_logger = Logger.getLogger(UserVmDomRInvestigator.class);
private String _name = null;
@Inject private HostDao _hostDao = null;
@Inject private DomainRouterDao _routerDao = null;;
@Inject private UserVmDao _userVmDao = null;
@Inject private AgentManager _agentMgr = null;
@Inject private NetworkManager _networkMgr = null;
@ -85,35 +82,11 @@ public class InvestigatorImpl implements Investigator {
}
return testUserVM(vm, nic, router);
} else if ((vm.getType() == VirtualMachine.Type.DomainRouter) || (vm.getType() == VirtualMachine.Type.ConsoleProxy)) {
// get the data center IP address, find a host on the pod, use that host to ping the data center IP address
HostVO vmHost = _hostDao.findById(vm.getHostId());
List<Long> otherHosts = findHostByPod(vm.getPodId(), vm.getHostId());
for (Long otherHost : otherHosts) {
Status vmState = testIpAddress(otherHost, vm.getPrivateIpAddress());
if (vmState == null) {
// can't get information from that host, try the next one
continue;
}
if (vmState == Status.Up) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("successfully pinged vm's private IP (" + vm.getPrivateIpAddress() + "), returning that the VM is up");
}
return Boolean.TRUE;
} else if (vmState == Status.Down) {
// We can't ping the VM directly...if we can ping the host, then report the VM down.
// If we can't ping the host, then we don't have enough information.
Status vmHostState = testIpAddress(otherHost, vmHost.getPrivateIpAddress());
if ((vmHostState != null) && (vmHostState == Status.Up)) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("successfully pinged vm's host IP (" + vmHost.getPrivateIpAddress() + "), but could not ping VM, returning that the VM is down");
}
return Boolean.FALSE;
}
}
}
}
}else{
if (s_logger.isDebugEnabled()) {
s_logger.debug("Not a User Vm, unable to determine state of vm (" + vm.getId() + "), returning null");
}
}
if (s_logger.isDebugEnabled()) {
s_logger.debug("unable to determine state of vm (" + vm.getId() + "), returning null");
}
@ -183,21 +156,17 @@ public class InvestigatorImpl implements Investigator {
return true;
}
// Host.status is up and Host.type is routing
private List<Long> findHostByPod(long podId, Long excludeHostId) {
List<Long> hostIds = _hostDao.listBy(null, podId, null, Type.Routing, Status.Up);
if (excludeHostId != null){
hostIds.remove(excludeHostId);
}
return hostIds;
}
private Boolean testUserVM(VMInstanceVO vm, Nic nic, VirtualRouter router) {
String privateIp = nic.getIp4Address();
String routerPrivateIp = router.getPrivateIpAddress();
List<Long> otherHosts = findHostByPod(router.getPodId(), null);
List<Long> otherHosts = new ArrayList<Long>();
if(vm.getHypervisorType() == HypervisorType.XenServer
|| vm.getHypervisorType() == HypervisorType.KVM){
otherHosts.add(router.getHostId());
}else{
otherHosts = findHostByPod(router.getPodId(), null);
}
for (Long hostId : otherHosts) {
try {
Answer pingTestAnswer = _agentMgr.send(hostId, new PingTestCommand(routerPrivateIp, privateIp), 30 * 1000);
@ -225,39 +194,4 @@ public class InvestigatorImpl implements Investigator {
return null;
}
private Status testIpAddress(Long hostId, String testHostIp) {
try {
Answer pingTestAnswer = _agentMgr.send(hostId, new PingTestCommand(testHostIp), 30 * 1000);
if(pingTestAnswer == null) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("host (" + testHostIp + ") returns null answer");
}
return null;
}
if (pingTestAnswer.getResult()) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("host (" + testHostIp + ") has been successfully pinged, returning that host is up");
}
// computing host is available, but could not reach agent, return false
return Status.Up;
} else {
if (pingTestAnswer.getDetails().startsWith("Unable to ping default route")) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("host (" + hostId + ") cannot ping default route, returning 'I don't know'");
}
return null;
}
if (s_logger.isDebugEnabled()) {
s_logger.debug("host (" + testHostIp + ") cannot be pinged, returning that host is down");
}
return Status.Down;
}
} catch (AgentUnavailableException e) {
return null;
} catch (OperationTimedoutException e) {
return null;
}
}
}