mirror of https://github.com/apache/cloudstack.git
propagate lock table fixes
This commit is contained in:
parent
15bf729927
commit
44ce9488a6
|
|
@ -1,201 +0,0 @@
|
|||
/**
|
||||
* Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
|
||||
*
|
||||
* This software is licensed under the GNU General Public License v3 or later.
|
||||
*
|
||||
* It is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or any later version.
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
*/
|
||||
package com.cloud.agent.manager;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import com.cloud.agent.Listener;
|
||||
import com.cloud.agent.api.AgentControlAnswer;
|
||||
import com.cloud.agent.api.AgentControlCommand;
|
||||
import com.cloud.agent.api.Answer;
|
||||
import com.cloud.agent.api.Command;
|
||||
import com.cloud.agent.api.PingCommand;
|
||||
import com.cloud.agent.api.StartupCommand;
|
||||
import com.cloud.alert.AlertManager;
|
||||
import com.cloud.dc.DataCenterVO;
|
||||
import com.cloud.dc.HostPodVO;
|
||||
import com.cloud.dc.dao.DataCenterDao;
|
||||
import com.cloud.dc.dao.HostPodDao;
|
||||
import com.cloud.host.Host;
|
||||
import com.cloud.host.HostVO;
|
||||
import com.cloud.host.Status;
|
||||
import com.cloud.host.Status.Event;
|
||||
import com.cloud.host.dao.HostDao;
|
||||
import com.cloud.utils.db.GlobalLock;
|
||||
import com.cloud.vm.VMInstanceVO;
|
||||
import com.cloud.vm.dao.VMInstanceDao;
|
||||
|
||||
public class AgentMonitor extends Thread implements Listener {
|
||||
private static Logger s_logger = Logger.getLogger(AgentMonitor.class);
|
||||
private final long _pingTimeout;
|
||||
private final HostDao _hostDao;
|
||||
private boolean _stop;
|
||||
private final AgentManagerImpl _agentMgr;
|
||||
private final VMInstanceDao _vmDao;
|
||||
private DataCenterDao _dcDao = null;
|
||||
private HostPodDao _podDao = null;
|
||||
private final AlertManager _alertMgr;
|
||||
private final long _msId;
|
||||
|
||||
public AgentMonitor(long msId, HostDao hostDao, VMInstanceDao vmDao, DataCenterDao dcDao, HostPodDao podDao, AgentManagerImpl agentMgr, AlertManager alertMgr, long pingTimeout) {
|
||||
super("AgentMonitor");
|
||||
_msId = msId;
|
||||
_pingTimeout = pingTimeout;
|
||||
_hostDao = hostDao;
|
||||
_agentMgr = agentMgr;
|
||||
_stop = false;
|
||||
_vmDao = vmDao;
|
||||
_dcDao = dcDao;
|
||||
_podDao = podDao;
|
||||
_alertMgr = alertMgr;
|
||||
}
|
||||
|
||||
// TODO : use host machine time is not safe in clustering environment
|
||||
@Override
|
||||
public void run() {
|
||||
s_logger.info("Agent Monitor is started.");
|
||||
|
||||
while (!_stop) {
|
||||
try {
|
||||
// check every 60 seconds
|
||||
Thread.sleep(60 * 1000);
|
||||
} catch (InterruptedException e) {
|
||||
s_logger.info("Who woke me from my slumber?");
|
||||
}
|
||||
|
||||
GlobalLock lock = GlobalLock.getInternLock("AgentMonitorLock");
|
||||
if (lock == null) {
|
||||
s_logger.error("Unable to acquire lock. Better luck next time?");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!lock.lock(10)) {
|
||||
s_logger.info("Someone else is already working on the agents. Skipping my turn");
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
long time = (System.currentTimeMillis() >> 10) - _pingTimeout;
|
||||
List<HostVO> hosts = _hostDao.findLostHosts(time);
|
||||
if (s_logger.isInfoEnabled()) {
|
||||
s_logger.info("Found " + hosts.size() + " hosts behind on ping. pingTimeout : " + _pingTimeout + ", mark time : " + time);
|
||||
}
|
||||
|
||||
for (HostVO host : hosts) {
|
||||
if (host.getType().equals(Host.Type.ExternalFirewall) ||
|
||||
host.getType().equals(Host.Type.ExternalLoadBalancer) ||
|
||||
host.getType().equals(Host.Type.TrafficMonitor) ||
|
||||
host.getType().equals(Host.Type.SecondaryStorage)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (host.getManagementServerId() == null || host.getManagementServerId() == _msId) {
|
||||
if (s_logger.isInfoEnabled()) {
|
||||
s_logger.info("Asking agent mgr to investgate why host " + host.getId() + " is behind on ping. last ping time: " + host.getLastPinged());
|
||||
}
|
||||
_agentMgr.disconnect(host.getId(), Event.PingTimeout, true);
|
||||
}
|
||||
}
|
||||
|
||||
hosts = _hostDao.listByStatus(Status.PrepareForMaintenance, Status.ErrorInMaintenance);
|
||||
for (HostVO host : hosts) {
|
||||
long hostId = host.getId();
|
||||
DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
|
||||
HostPodVO podVO = _podDao.findById(host.getPodId());
|
||||
String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
|
||||
|
||||
if (host.getType() != Host.Type.Storage) {
|
||||
List<VMInstanceVO> vos = _vmDao.listByHostId(host.getId());
|
||||
if (vos.size() == 0) {
|
||||
_alertMgr.sendAlert(AlertManager.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Migration Complete for host " + hostDesc, "Host [" + hostDesc + "] is ready for maintenance");
|
||||
_hostDao.updateStatus(host, Event.PreparationComplete, _msId);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Throwable th) {
|
||||
s_logger.error("Caught the following exception: ", th);
|
||||
} finally {
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
s_logger.info("Agent Monitor is leaving the building!");
|
||||
}
|
||||
|
||||
public void signalStop() {
|
||||
_stop = true;
|
||||
interrupt();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isRecurring() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean processAnswers(long agentId, long seq, Answer[] answers) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean processCommands(long agentId, long seq, Command[] commands) {
|
||||
boolean processed = false;
|
||||
for (Command cmd : commands) {
|
||||
if (cmd instanceof PingCommand) {
|
||||
HostVO host = _hostDao.findById(agentId);
|
||||
if( host == null ) {
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Cant not find host " + agentId);
|
||||
}
|
||||
} else {
|
||||
_hostDao.updateStatus(host, Event.Ping, _msId);
|
||||
}
|
||||
processed = true;
|
||||
}
|
||||
}
|
||||
return processed;
|
||||
}
|
||||
|
||||
@Override
|
||||
public AgentControlAnswer processControlCommand(long agentId, AgentControlCommand cmd) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processConnect(HostVO host, StartupCommand cmd, boolean forRebalance) {
|
||||
s_logger.debug("Registering agent monitor for " + host.getId());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean processDisconnect(long agentId, Status state) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean processTimeout(long agentId, long seq) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTimeout() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -477,14 +477,14 @@ public class ClusterManagerImpl implements ClusterManager {
|
|||
}
|
||||
|
||||
public void notifyNodeJoined(List<ManagementServerHostVO> nodeList) {
|
||||
if(s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Notify management server node join to listeners.");
|
||||
|
||||
for(ManagementServerHostVO mshost : nodeList) {
|
||||
s_logger.debug("Joining node, IP: " + mshost.getServiceIP() + ", msid: " + mshost.getMsid());
|
||||
}
|
||||
}
|
||||
|
||||
if(s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Notify management server node join to listeners.");
|
||||
|
||||
for(ManagementServerHostVO mshost : nodeList) {
|
||||
s_logger.debug("Joining node, IP: " + mshost.getServiceIP() + ", msid: " + mshost.getMsid());
|
||||
}
|
||||
}
|
||||
|
||||
synchronized(listeners) {
|
||||
for(ClusterManagerListener listener : listeners) {
|
||||
listener.onManagementNodeJoined(nodeList, _mshostId);
|
||||
|
|
@ -496,14 +496,14 @@ public class ClusterManagerImpl implements ClusterManager {
|
|||
}
|
||||
|
||||
public void notifyNodeLeft(List<ManagementServerHostVO> nodeList) {
|
||||
if(s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Notify management server node left to listeners.");
|
||||
|
||||
for(ManagementServerHostVO mshost : nodeList) {
|
||||
s_logger.debug("Leaving node, IP: " + mshost.getServiceIP() + ", msid: " + mshost.getMsid());
|
||||
}
|
||||
}
|
||||
|
||||
if(s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Notify management server node left to listeners.");
|
||||
|
||||
for(ManagementServerHostVO mshost : nodeList) {
|
||||
s_logger.debug("Leaving node, IP: " + mshost.getServiceIP() + ", msid: " + mshost.getMsid());
|
||||
}
|
||||
}
|
||||
|
||||
synchronized(listeners) {
|
||||
for(ClusterManagerListener listener : listeners) {
|
||||
listener.onManagementNodeLeft(nodeList, _mshostId);
|
||||
|
|
@ -515,9 +515,9 @@ public class ClusterManagerImpl implements ClusterManager {
|
|||
}
|
||||
|
||||
public void notifyNodeIsolated() {
|
||||
if(s_logger.isDebugEnabled())
|
||||
s_logger.debug("Notify management server node isolation to listeners");
|
||||
|
||||
if(s_logger.isDebugEnabled())
|
||||
s_logger.debug("Notify management server node isolation to listeners");
|
||||
|
||||
synchronized(listeners) {
|
||||
for(ClusterManagerListener listener : listeners) {
|
||||
listener.onManagementNodeIsolated();
|
||||
|
|
@ -673,7 +673,7 @@ public class ClusterManagerImpl implements ClusterManager {
|
|||
private Connection getHeartbeatConnection() throws SQLException {
|
||||
if(_heartbeatConnection == null) {
|
||||
Connection conn = Transaction.getStandaloneConnectionWithException();
|
||||
_heartbeatConnection = new ConnectionConcierge("ClusterManagerHeartBeat", conn, false, false);
|
||||
_heartbeatConnection = new ConnectionConcierge("ClusterManagerHeartBeat", conn, false);
|
||||
}
|
||||
|
||||
return _heartbeatConnection.conn();
|
||||
|
|
@ -955,11 +955,10 @@ public class ClusterManagerImpl implements ClusterManager {
|
|||
if (s_logger.isInfoEnabled()) {
|
||||
s_logger.info("Management server (host id : " + _mshostId + ") is being started at " + _clusterNodeIP + ":" + _currentServiceAdapter.getServicePort());
|
||||
}
|
||||
|
||||
|
||||
// use separate thread for heartbeat updates
|
||||
_heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), heartbeatInterval, heartbeatInterval, TimeUnit.MILLISECONDS);
|
||||
_notificationExecutor.submit(getNotificationTask());
|
||||
|
||||
|
||||
} catch (Throwable e) {
|
||||
s_logger.error("Unexpected exception : ", e);
|
||||
|
|
@ -1079,8 +1078,8 @@ public class ClusterManagerImpl implements ClusterManager {
|
|||
if(_currentServiceAdapter == null) {
|
||||
throw new ConfigurationException("Unable to set current cluster service adapter");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
_agentLBEnabled = Boolean.valueOf(configDao.getValue(Config.AgentLbEnable.key()));
|
||||
|
||||
String connectedAgentsThreshold = configs.get("agent.load.threshold");
|
||||
|
|
@ -1211,7 +1210,7 @@ public class ClusterManagerImpl implements ClusterManager {
|
|||
/**
 * Hands the rebalance request for an agent to the rebalance service and
 * returns whatever that service reports.
 *
 * @param agentId id of the agent to rebalance
 * @param event event passed through to the rebalance service
 * @param currentOwnerId id of the current owner, passed through unchanged
 * @param futureOwnerId id of the intended owner, passed through unchanged
 * @return the result of {@code _rebalanceService.executeRebalanceRequest}
 * @throws AgentUnavailableException declared by this method; raised by the delegate, presumably - verify
 * @throws OperationTimedoutException declared by this method; raised by the delegate, presumably - verify
 */
public boolean rebalanceAgent(long agentId, Event event, long currentOwnerId, long futureOwnerId) throws AgentUnavailableException, OperationTimedoutException {
|
||||
return _rebalanceService.executeRebalanceRequest(agentId, currentOwnerId, futureOwnerId, event);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isAgentRebalanceEnabled() {
|
||||
return _agentLBEnabled;
|
||||
|
|
|
|||
|
|
@ -48,19 +48,19 @@ import com.cloud.utils.mgmt.JmxUtil;
|
|||
* your own.
|
||||
*/
|
||||
public class ConnectionConcierge {
|
||||
|
||||
|
||||
static final Logger s_logger = Logger.getLogger(ConnectionConcierge.class);
|
||||
|
||||
|
||||
static final ConnectionConciergeManager s_mgr = new ConnectionConciergeManager();
|
||||
|
||||
|
||||
Connection _conn;
|
||||
String _name;
|
||||
boolean _keepAlive;
|
||||
boolean _autoCommit;
|
||||
int _isolationLevel;
|
||||
int _holdability;
|
||||
|
||||
public ConnectionConcierge(String name, Connection conn, boolean autoCommit, boolean keepAlive) {
|
||||
|
||||
public ConnectionConcierge(String name, Connection conn, boolean keepAlive) {
|
||||
_name = name + s_mgr.getNextId();
|
||||
_keepAlive = keepAlive;
|
||||
try {
|
||||
|
|
@ -72,7 +72,7 @@ public class ConnectionConcierge {
|
|||
}
|
||||
reset(conn);
|
||||
}
|
||||
|
||||
|
||||
public void reset(Connection conn) {
|
||||
try {
|
||||
release();
|
||||
|
|
@ -90,11 +90,11 @@ public class ConnectionConcierge {
|
|||
s_mgr.register(_name, this);
|
||||
s_logger.debug("Registering a database connection for " + _name);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the connection currently held in {@code _conn}.
 */
public final Connection conn() {
|
||||
return _conn;
|
||||
}
|
||||
|
||||
|
||||
public void release() {
|
||||
s_mgr.unregister(_name);
|
||||
try {
|
||||
|
|
@ -106,23 +106,23 @@ public class ConnectionConcierge {
|
|||
throw new CloudRuntimeException("Problem in closing a connection", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected void finalize() throws Exception {
|
||||
if (_conn != null) {
|
||||
release();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the keep-alive flag this concierge was constructed with.
 */
public boolean keepAlive() {
|
||||
return _keepAlive;
|
||||
}
|
||||
|
||||
|
||||
protected static class ConnectionConciergeManager extends StandardMBean implements ConnectionConciergeMBean {
|
||||
ScheduledExecutorService _executor = Executors.newScheduledThreadPool(1, new NamedThreadFactory("ConnectionKeeper"));
|
||||
final ConcurrentHashMap<String, ConnectionConcierge> _conns = new ConcurrentHashMap<String, ConnectionConcierge>();
|
||||
final AtomicInteger _idGenerator = new AtomicInteger();
|
||||
|
||||
|
||||
ConnectionConciergeManager() {
|
||||
super(ConnectionConciergeMBean.class, false);
|
||||
resetKeepAliveTask(20);
|
||||
|
|
@ -132,19 +132,19 @@ public class ConnectionConcierge {
|
|||
s_logger.error("Unable to register mbean", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Increments the manager's atomic id counter and returns the new value.
 */
public Integer getNextId() {
|
||||
return _idGenerator.incrementAndGet();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Stores the concierge in the manager's map under the given name,
 * replacing any entry already stored under that name.
 *
 * @param name key to register the concierge under
 * @param concierge concierge to track
 */
public void register(String name, ConnectionConcierge concierge) {
|
||||
_conns.put(name, concierge);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Removes the entry stored under the given name from the manager's map;
 * does nothing if no such entry exists.
 *
 * @param name key of the concierge to stop tracking
 */
public void unregister(String name) {
|
||||
_conns.remove(name);
|
||||
}
|
||||
|
||||
|
||||
protected String testValidity(String name, Connection conn) {
|
||||
PreparedStatement pstmt = null;
|
||||
try {
|
||||
|
|
@ -182,12 +182,12 @@ public class ConnectionConcierge {
|
|||
if (concierge == null) {
|
||||
return "Not Found";
|
||||
}
|
||||
|
||||
|
||||
Connection conn = Transaction.getStandaloneConnection();
|
||||
if (conn == null) {
|
||||
return "Unable to get anotehr db connection";
|
||||
}
|
||||
|
||||
|
||||
concierge.reset(conn);
|
||||
return "Done";
|
||||
}
|
||||
|
|
@ -201,7 +201,7 @@ public class ConnectionConcierge {
|
|||
s_logger.error("Unable to shutdown executor", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
_executor = Executors.newScheduledThreadPool(1, new NamedThreadFactory("ConnectionConcierge"));
|
||||
_executor.schedule(new Runnable() {
|
||||
@Override
|
||||
|
|
@ -215,7 +215,7 @@ public class ConnectionConcierge {
|
|||
}
|
||||
}
|
||||
}, seconds, TimeUnit.SECONDS);
|
||||
|
||||
|
||||
return "As you wish.";
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -66,7 +66,9 @@ public class Merovingian2 extends StandardMBean implements MerovingianMBean {
|
|||
Connection conn = null;
|
||||
try {
|
||||
conn = Transaction.getStandaloneConnectionWithException();
|
||||
_concierge = new ConnectionConcierge("LockMaster", conn, true, true);
|
||||
conn.setTransactionIsolation(Connection.TRANSACTION_READ_COMMITTED);
|
||||
conn.setAutoCommit(true);
|
||||
_concierge = new ConnectionConcierge("LockMaster", conn, false);
|
||||
} catch (SQLException e) {
|
||||
s_logger.error("Unable to get a new db connection", e);
|
||||
throw new CloudRuntimeException("Unable to initialize a connection to the database for locking purposes: ", e);
|
||||
|
|
|
|||
Loading…
Reference in New Issue