diff --git a/engine/orchestration/src/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/com/cloud/agent/manager/AgentManagerImpl.java index 64aaf48ae87..a38fd0881c2 100644 --- a/engine/orchestration/src/com/cloud/agent/manager/AgentManagerImpl.java +++ b/engine/orchestration/src/com/cloud/agent/manager/AgentManagerImpl.java @@ -824,25 +824,30 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl /* OK, we are going to the bad status, let's see what happened */ s_logger.info("Investigating why host " + hostId + " has disconnected with event " + event); - final Status determinedState = investigate(attache); + Status determinedState = investigate(attache); // if state cannot be determined do nothing and bail out if (determinedState == null) { - s_logger.warn("Agent state cannot be determined, do nothing"); - return false; + if (((System.currentTimeMillis() >> 10) - host.getLastPinged()) > AlertWait.value()) { + s_logger.warn("Agent " + hostId + " state cannot be determined for more than " + AlertWait + "(" + AlertWait.value() + ") seconds, will go to Alert state"); + determinedState = Status.Alert; + } else { + s_logger.warn("Agent " + hostId + " state cannot be determined, do nothing"); + return false; + } } final Status currentStatus = host.getStatus(); - s_logger.info("The state determined is " + determinedState); + s_logger.info("The agent " + hostId + " state determined is " + determinedState); if (determinedState == Status.Down) { - s_logger.error("Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs"); + String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs"; + s_logger.error(message); if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) { - _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host disconnected, " + host.getId(), - "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs"); + _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message); } event = Status.Event.HostDown; } else if (determinedState == Status.Up) { - /* Got ping response from host, bring it back*/ + /* Got ping response from host, bring it back */ s_logger.info("Agent is determined to be up and running"); agentStatusTransitTo(host, Status.Event.Ping, _nodeId); return false; @@ -850,10 +855,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl s_logger.warn("Agent is disconnected but the host is still up: " + host.getId() + "-" + host.getName()); if (currentStatus == Status.Disconnected) { if ((System.currentTimeMillis() >> 10) - host.getLastPinged() > AlertWait.value()) { - s_logger.warn("Host " + host.getId() + " has been disconnected pass the time it should be disconnected."); + s_logger.warn("Host " + host.getId() + " has been disconnected past the wait time it should be disconnected."); event = Status.Event.WaitedTooLong; } else { - s_logger.debug("Host has been determined to be disconnected but it hasn't passed the wait time yet."); + s_logger.debug("Host " + host.getId() + " has been determined to be disconnected but it hasn't passed the wait time yet."); return false; } } else if (currentStatus == Status.Up) { @@ -862,7 +867,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) { _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host disconnected, " + hostDesc, - "If the agent for host [" + hostDesc + "] is not restarted within " + AlertWait + " seconds, HA will begin on the VMs"); + "If the agent for host [" + hostDesc + "] is not restarted within " + AlertWait + " seconds, host will go to Alert state"); } event = Status.Event.AgentDisconnected; } @@ -872,11 +877,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl final HostPodVO podVO = _podDao.findById(host.getPodId()); final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); _alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host in ALERT state, " + hostDesc, - "In availability zone " + host.getDataCenterId() + ", " + host.getId() + "-" + host.getName() - + " disconnect due to event " + event + ", ms can't determine the host status" ); + "In availability zone " + host.getDataCenterId() + ", host is in alert state: " + host.getId() + "-" + host.getName()); } } else { - s_logger.debug("The next status of Agent " + host.getId() + " is not Alert, no need to investigate what happened"); + s_logger.debug("The next status of agent " + host.getId() + " is not Alert, no need to investigate what happened"); } } handleDisconnectWithoutInvestigation(attache, event, true, true); diff --git a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java index 1ecdfcdc17c..0a179a099e7 100644 --- a/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -220,7 +220,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai } } - return Status.Alert; + return hostState; } @Override