From 748603f62d4e25f9d0a2cd703f160aaec1bb7372 Mon Sep 17 00:00:00 2001 From: frank Date: Tue, 24 Jan 2012 15:08:43 -0800 Subject: [PATCH] Bug 13269 - vmware - host put in maintenance mode> cancel maintenance mode> host remains in Connecting state We use 'update count' to make sure agent status transformation is atomic. However, atomic means success or fail, which is not true for agent status. Some important transformations occasionally fail because of a race condition in which another caller is changing the status simultaneously, which finally leaves the agent stuck in a wrong status. Use a reentrant lock to serialize the agent status transformation. This in-memory lock works in a clustered environment as well, because in our design an agent is only active in one mgmt server. status 13269: resolved fixed --- .../cloud/agent/manager/AgentManagerImpl.java | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/server/src/com/cloud/agent/manager/AgentManagerImpl.java b/server/src/com/cloud/agent/manager/AgentManagerImpl.java index df6a8b00d6b..c9198f87704 100755 --- a/server/src/com/cloud/agent/manager/AgentManagerImpl.java +++ b/server/src/com/cloud/agent/manager/AgentManagerImpl.java @@ -31,6 +31,8 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; import javax.ejb.Local; import javax.naming.ConfigurationException; @@ -147,6 +149,7 @@ public class AgentManagerImpl implements AgentManager, HandlerFactory, Manager { protected List> _creationMonitors = new ArrayList>(17); protected List _loadingAgents = new ArrayList(); protected int _monitorId = 0; + private Lock _agentStatusLock = new ReentrantLock(); protected NioServer _connection; @Inject @@ -1393,22 +1396,29 @@ public class AgentManagerImpl implements AgentManager, HandlerFactory, Manager { 
@Override public boolean agentStatusTransitTo(HostVO host, Status.Event e, long msId) { - if (status_logger.isDebugEnabled()) { - ResourceState state = host.getResourceState(); - StringBuilder msg = new StringBuilder("Transition:"); - msg.append("[Resource state = ").append(state); - msg.append(", Agent event = ").append(e.toString()); - msg.append(", Host id = ").append(host.getId()).append(", name = " + host.getName()).append("]"); - status_logger.debug(msg); - } - - host.setManagementServerId(msId); - try { - return _statusStateMachine.transitTo(host, e, host.getId(), _hostDao); - } catch (NoTransitionException e1) { - status_logger.debug("Cannot transit agent status with event " + e + " for host " + host.getId() + ", name=" + host.getName()+ ", mangement server id is " + msId); - throw new CloudRuntimeException("Cannot transit agent status with event " + e + " for host " + host.getId() + ", mangement server id is " + msId + "," + e1.getMessage()); - } + _agentStatusLock.lock(); /* acquire BEFORE the try: if lock() itself fails, the finally below must not unlock a lock this thread never held */ + try { + if (status_logger.isDebugEnabled()) { + ResourceState state = host.getResourceState(); + StringBuilder msg = new StringBuilder("Transition:"); + msg.append("[Resource state = ").append(state); + msg.append(", Agent event = ").append(e.toString()); + msg.append(", Host id = ").append(host.getId()).append(", name = " + host.getName()).append("]"); + status_logger.debug(msg); + } + + host.setManagementServerId(msId); + try { + return _statusStateMachine.transitTo(host, e, host.getId(), _hostDao); + } catch (NoTransitionException e1) { + status_logger.debug("Cannot transit agent status with event " + e + " for host " + host.getId() + ", name=" + host.getName() + ", mangement server id is " + msId); + throw new CloudRuntimeException("Cannot transit agent status with event " + e + " for host " + host.getId() + ", mangement server id is " + msId + "," + e1.getMessage()); + } + } finally { /* runs on both the normal return and the CloudRuntimeException path, so the lock is always released */ + _agentStatusLock.unlock(); + } } public boolean disconnectAgent(HostVO host, Status.Event 
e, long msId) {