diff --git a/server/src/com/cloud/cluster/ActiveFencingException.java b/server/src/com/cloud/cluster/ActiveFencingException.java
new file mode 100644
index 00000000000..65149d5e353
--- /dev/null
+++ b/server/src/com/cloud/cluster/ActiveFencingException.java
@@ -0,0 +1,31 @@
+/**
+ * Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
+ *
+ * This software is licensed under the GNU General Public License v3 or later.
+ *
+ * It is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ *
+ */
+
+package com.cloud.cluster;
+
+public class ActiveFencingException extends Exception {
+ private static final long serialVersionUID = -3975376101728211726L;
+
+ public ActiveFencingException(String message) {
+ super(message);
+ }
+
+ public ActiveFencingException(String message, Throwable th) {
+ super(message, th);
+ }
+}
diff --git a/server/src/com/cloud/cluster/ClusterManagerImpl.java b/server/src/com/cloud/cluster/ClusterManagerImpl.java
index 02aa52a078a..bb5bac3d060 100755
--- a/server/src/com/cloud/cluster/ClusterManagerImpl.java
+++ b/server/src/com/cloud/cluster/ClusterManagerImpl.java
@@ -51,8 +51,10 @@ import com.cloud.agent.api.ChangeAgentCommand;
import com.cloud.agent.api.Command;
import com.cloud.agent.api.PropagateResourceEventCommand;
import com.cloud.agent.manager.Commands;
+import com.cloud.cluster.ManagementServerHost.State;
import com.cloud.cluster.agentlb.dao.HostTransferMapDao;
import com.cloud.cluster.dao.ManagementServerHostDao;
+import com.cloud.cluster.dao.ManagementServerHostPeerDao;
import com.cloud.configuration.Config;
import com.cloud.configuration.dao.ConfigurationDao;
import com.cloud.exception.AgentUnavailableException;
@@ -119,6 +121,7 @@ public class ClusterManagerImpl implements ClusterManager {
private ClusterServiceAdapter _currentServiceAdapter;
private ManagementServerHostDao _mshostDao;
+ private ManagementServerHostPeerDao _mshostPeerDao;
private HostDao _hostDao;
private HostTransferMapDao _hostTransferDao;
@@ -683,6 +686,8 @@ public class ClusterManagerImpl implements ClusterManager {
}
invalidHeartbeatConnection();
+ } catch(ActiveFencingException e) {
+ queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
} catch (Throwable e) {
if(isRootCauseConnectionRelated(e.getCause())) {
s_logger.error("DB communication problem detected");
@@ -804,6 +809,34 @@ public class ClusterManagerImpl implements ClusterManager {
this._notificationMsgs.add(msg);
this._notificationMsgs.notifyAll();
}
+
+ switch(msg.getMessageType()) {
+ case nodeAdded:
+ {
+ List l = msg.getNodes();
+ if(l != null && l.size() > 0) {
+ for(ManagementServerHostVO mshost: l) {
+ _mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Up);
+ }
+ }
+ }
+ break;
+
+ case nodeRemoved:
+ {
+ List l = msg.getNodes();
+ if(l != null && l.size() > 0) {
+ for(ManagementServerHostVO mshost: l) {
+ _mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Down);
+ }
+ }
+ }
+ break;
+
+ default :
+ break;
+
+ }
}
private ClusterManagerMessage getNextNotificationMessage() {
@@ -848,7 +881,7 @@ public class ClusterManagerImpl implements ClusterManager {
}
}
- private void peerScan() {
+ private void peerScan() throws ActiveFencingException {
Date cutTime = DateUtil.currentGMTTime();
List currentList = _mshostDao.getActiveList(new Date(cutTime.getTime() - _heartbeatThreshold));
@@ -857,6 +890,13 @@ public class ClusterManagerImpl implements ClusterManager {
List invalidatedNodeList = new ArrayList();
if(_mshostId != null) {
+
+ if(_mshostPeerDao.countStateSeenInPeers(_mshostId, _runId, ManagementServerHost.State.Down) > 0) {
+ String msg = "We have detected that at least one management server peer reports that this management server is down, perform active fencing to avoid split-brain situation";
+ s_logger.error(msg);
+ throw new ActiveFencingException(msg);
+ }
+
// only if we have already attached to cluster, will we start to check leaving nodes
for(Map.Entry entry : _activePeers.entrySet()) {
@@ -1007,6 +1047,8 @@ public class ClusterManagerImpl implements ClusterManager {
if (s_logger.isInfoEnabled()) {
s_logger.info("Management server (host id : " + _mshostId + ") is being started at " + _clusterNodeIP + ":" + _currentServiceAdapter.getServicePort());
}
+
+ _mshostPeerDao.clearPeerInfo(_mshostId);
// use seperate thread for heartbeat updates
_heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), _heartbeatInterval, _heartbeatInterval, TimeUnit.MILLISECONDS);
@@ -1067,7 +1109,12 @@ public class ClusterManagerImpl implements ClusterManager {
if (_mshostDao == null) {
throw new ConfigurationException("Unable to get " + ManagementServerHostDao.class.getName());
}
-
+
+ _mshostPeerDao = locator.getDao(ManagementServerHostPeerDao.class);
+ if (_mshostPeerDao == null) {
+ throw new ConfigurationException("Unable to get " + ManagementServerHostPeerDao.class.getName());
+ }
+
_hostDao = locator.getDao(HostDao.class);
if (_hostDao == null) {
throw new ConfigurationException("Unable to get " + HostDao.class.getName());
diff --git a/server/src/com/cloud/cluster/ManagementServerHostPeerVO.java b/server/src/com/cloud/cluster/ManagementServerHostPeerVO.java
new file mode 100644
index 00000000000..eafbe939156
--- /dev/null
+++ b/server/src/com/cloud/cluster/ManagementServerHostPeerVO.java
@@ -0,0 +1,121 @@
+/**
+ * Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
+ *
+ * This software is licensed under the GNU General Public License v3 or later.
+ *
+ * It is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ *
+ */
+
+package com.cloud.cluster;
+
+import java.util.Date;
+
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.EnumType;
+import javax.persistence.Enumerated;
+import javax.persistence.GeneratedValue;
+import javax.persistence.GenerationType;
+import javax.persistence.Id;
+import javax.persistence.Table;
+import javax.persistence.Temporal;
+import javax.persistence.TemporalType;
+
+import com.cloud.utils.DateUtil;
+
+@Entity
+@Table(name="mshost_peer")
+public class ManagementServerHostPeerVO {
+
+ @Id
+ @GeneratedValue(strategy=GenerationType.IDENTITY)
+ @Column(name="id")
+ private long id;
+
+ @Column(name="owner_mshost", updatable=true, nullable=false)
+ private long ownerMshost;
+
+ @Column(name="peer_mshost", updatable=true, nullable=false)
+ private long peerMshost;
+
+ @Column(name="peer_runid", updatable=true, nullable=false)
+ private long peerRunid;
+
+ @Column(name="peer_state", updatable = true, nullable=false)
+ @Enumerated(value=EnumType.STRING)
+ private ManagementServerHost.State peerState;
+
+ @Temporal(TemporalType.TIMESTAMP)
+ @Column(name="last_update", updatable=true, nullable=true)
+ private Date lastUpdateTime;
+
+ public ManagementServerHostPeerVO() {
+ }
+
+ public ManagementServerHostPeerVO(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState) {
+ this.ownerMshost = ownerMshost;
+ this.peerMshost = peerMshost;
+ this.peerRunid = peerRunid;
+ this.peerState = peerState;
+
+ this.lastUpdateTime = DateUtil.currentGMTTime();
+ }
+
+ public long getId() {
+ return id;
+ }
+
+ public void setId(long id) {
+ this.id = id;
+ }
+
+ public long getOwnerMshost() {
+ return ownerMshost;
+ }
+
+ public void setOwnerMshost(long ownerMshost) {
+ this.ownerMshost = ownerMshost;
+ }
+
+ public long getPeerMshost() {
+ return peerMshost;
+ }
+
+ public void setPeerMshost(long peerMshost) {
+ this.peerMshost = peerMshost;
+ }
+
+ public long getPeerRunid() {
+ return peerRunid;
+ }
+
+ public void setPeerRunid(long peerRunid) {
+ this.peerRunid = peerRunid;
+ }
+
+ public ManagementServerHost.State getPeerState() {
+ return peerState;
+ }
+
+ public void setPeerState(ManagementServerHost.State peerState) {
+ this.peerState = peerState;
+ }
+
+ public Date getLastUpdateTime() {
+ return lastUpdateTime;
+ }
+
+ public void setLastUpdateTime(Date lastUpdateTime) {
+ this.lastUpdateTime = lastUpdateTime;
+ }
+}
diff --git a/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDao.java b/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDao.java
new file mode 100644
index 00000000000..599996e2ba5
--- /dev/null
+++ b/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDao.java
@@ -0,0 +1,29 @@
+/**
+ * Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
+ *
+ * This software is licensed under the GNU General Public License v3 or later.
+ *
+ * It is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ *
+ */
+
+package com.cloud.cluster.dao;
+
+import com.cloud.cluster.ManagementServerHost;
+import com.cloud.cluster.ManagementServerHostPeerVO;
+import com.cloud.utils.db.GenericDao;
+
+public interface ManagementServerHostPeerDao extends GenericDao {
+ void clearPeerInfo(long ownerMshost);
+ void updatePeerInfo(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState);
+ int countStateSeenInPeers(long mshost, long runid, ManagementServerHost.State state);
+}
diff --git a/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDaoImpl.java b/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDaoImpl.java
new file mode 100644
index 00000000000..658f98efa44
--- /dev/null
+++ b/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDaoImpl.java
@@ -0,0 +1,108 @@
+/**
+ * Copyright (C) 2010 Cloud.com, Inc. All rights reserved.
+ *
+ * This software is licensed under the GNU General Public License v3 or later.
+ *
+ * It is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ *
+ */
+
+package com.cloud.cluster.dao;
+
+import java.util.List;
+
+import javax.ejb.Local;
+
+import org.apache.log4j.Logger;
+
+import com.cloud.cluster.ManagementServerHost;
+import com.cloud.cluster.ManagementServerHostPeerVO;
+import com.cloud.utils.db.DB;
+import com.cloud.utils.db.GenericDaoBase;
+import com.cloud.utils.db.SearchBuilder;
+import com.cloud.utils.db.SearchCriteria;
+import com.cloud.utils.db.Transaction;
+
+@Local(value={ManagementServerHostPeerDao.class})
+public class ManagementServerHostPeerDaoImpl extends GenericDaoBase implements ManagementServerHostPeerDao {
+ private static final Logger s_logger = Logger.getLogger(ManagementServerHostPeerDaoImpl.class);
+
+ private final SearchBuilder ClearPeerSearch;
+ private final SearchBuilder FindForUpdateSearch;
+ private final SearchBuilder CountSearch;
+
+ public ManagementServerHostPeerDaoImpl() {
+ ClearPeerSearch = createSearchBuilder();
+ ClearPeerSearch.and("ownerMshost", ClearPeerSearch.entity().getOwnerMshost(), SearchCriteria.Op.EQ);
+ ClearPeerSearch.done();
+
+ FindForUpdateSearch = createSearchBuilder();
+ FindForUpdateSearch.and("ownerMshost", FindForUpdateSearch.entity().getOwnerMshost(), SearchCriteria.Op.EQ);
+ FindForUpdateSearch.and("peerMshost", FindForUpdateSearch.entity().getPeerMshost(), SearchCriteria.Op.EQ);
+ FindForUpdateSearch.and("peerRunid", FindForUpdateSearch.entity().getPeerRunid(), SearchCriteria.Op.EQ);
+ FindForUpdateSearch.done();
+
+ CountSearch = createSearchBuilder();
+ CountSearch.and("peerMshost", CountSearch.entity().getPeerMshost(), SearchCriteria.Op.EQ);
+ CountSearch.and("peerRunid", CountSearch.entity().getPeerRunid(), SearchCriteria.Op.EQ);
+ CountSearch.and("peerState", CountSearch.entity().getPeerState(), SearchCriteria.Op.EQ);
+ CountSearch.done();
+ }
+
+ @Override
+ @DB
+ public void clearPeerInfo(long ownerMshost) {
+ SearchCriteria sc = ClearPeerSearch.create();
+ sc.setParameters("ownerMshost", ownerMshost);
+
+ expunge(sc);
+ }
+
+ @Override
+ @DB
+ public void updatePeerInfo(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState) {
+ Transaction txn = Transaction.currentTxn();
+ try {
+ txn.start();
+
+ SearchCriteria sc = FindForUpdateSearch.create();
+ sc.setParameters("ownerMshost", ownerMshost);
+ sc.setParameters("peerMshost", peerMshost);
+ sc.setParameters("peerRunid", peerRunid);
+ List l = listBy(sc);
+ if(l.size() == 1) {
+ ManagementServerHostPeerVO peer = l.get(0);
+ peer.setPeerState(peerState);
+ update(peer.getId(), peer);
+ } else {
+ ManagementServerHostPeerVO peer = new ManagementServerHostPeerVO(ownerMshost, peerMshost, peerRunid, peerState);
+ persist(peer);
+ }
+ txn.commit();
+ } catch(Exception e) {
+ s_logger.warn("Unexpected exception, ", e);
+ txn.rollback();
+ }
+ }
+
+ @Override
+ @DB
+ public int countStateSeenInPeers(long mshost, long runid, ManagementServerHost.State state) {
+ SearchCriteria sc = CountSearch.create();
+ sc.setParameters("peerMshost", mshost);
+ sc.setParameters("peerRunid", runid);
+ sc.setParameters("peerState", state);
+
+ List l = listBy(sc);
+ return l.size();
+ }
+}
diff --git a/server/src/com/cloud/configuration/DefaultComponentLibrary.java b/server/src/com/cloud/configuration/DefaultComponentLibrary.java
index e04946b7c4b..d1b1e4b1c30 100755
--- a/server/src/com/cloud/configuration/DefaultComponentLibrary.java
+++ b/server/src/com/cloud/configuration/DefaultComponentLibrary.java
@@ -39,6 +39,7 @@ import com.cloud.cluster.ClusterFenceManagerImpl;
import com.cloud.cluster.ClusterManagerImpl;
import com.cloud.cluster.agentlb.dao.HostTransferMapDaoImpl;
import com.cloud.cluster.dao.ManagementServerHostDaoImpl;
+import com.cloud.cluster.dao.ManagementServerHostPeerDaoImpl;
import com.cloud.cluster.dao.StackMaidDaoImpl;
import com.cloud.configuration.dao.ConfigurationDaoImpl;
import com.cloud.configuration.dao.ResourceCountDaoImpl;
@@ -262,6 +263,7 @@ public class DefaultComponentLibrary extends ComponentLibraryBase implements Com
addDao("ConsoleProxyDao", ConsoleProxyDaoImpl.class);
addDao("SecondaryStorageVmDao", SecondaryStorageVmDaoImpl.class);
addDao("ManagementServerHostDao", ManagementServerHostDaoImpl.class);
+ addDao("ManagementServerHostPeerDao", ManagementServerHostPeerDaoImpl.class);
addDao("AgentUpgradeDao", AgentUpgradeDaoImpl.class);
addDao("SnapshotDao", SnapshotDaoImpl.class);
addDao("AsyncJobDao", AsyncJobDaoImpl.class);
diff --git a/setup/db/create-schema.sql b/setup/db/create-schema.sql
index 6d332071a3e..951d415854e 100755
--- a/setup/db/create-schema.sql
+++ b/setup/db/create-schema.sql
@@ -789,6 +789,20 @@ CREATE TABLE `cloud`.`mshost` (
INDEX `i_mshost__last_update`(`last_update`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+CREATE TABLE `cloud`.`mshost_peer` (
+ `id` bigint unsigned NOT NULL auto_increment,
+ `owner_mshost` bigint unsigned NOT NULL,
+ `peer_mshost` bigint unsigned NOT NULL,
+ `peer_runid` bigint NOT NULL,
+ `peer_state` varchar(10) NOT NULL DEFAULT 'Down',
+ `last_update` DATETIME NULL COMMENT 'Last record update time',
+
+ PRIMARY KEY (`id`),
+ CONSTRAINT `fk_mshost_peer__owner_mshost` FOREIGN KEY (`owner_mshost`) REFERENCES `mshost`(`id`) ON DELETE CASCADE,
+ CONSTRAINT `fk_mshost_peer__peer_mshost` FOREIGN KEY (`peer_mshost`) REFERENCES `mshost`(`id`),
+ UNIQUE `i_mshost_peer__owner_peer_runid`(`owner_mshost`, `peer_mshost`, `peer_runid`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
CREATE TABLE `cloud`.`host_tags` (
`id` bigint unsigned NOT NULL auto_increment,
`host_id` bigint unsigned NOT NULL COMMENT 'host id',
diff --git a/setup/db/db/schema-2213to2214.sql b/setup/db/db/schema-2213to2214.sql
index 4673197789f..1577986f9e3 100644
--- a/setup/db/db/schema-2213to2214.sql
+++ b/setup/db/db/schema-2213to2214.sql
@@ -4,3 +4,18 @@
ALTER TABLE `cloud`.`vm_template` MODIFY `extractable` int(1) unsigned NOT NULL default 0 COMMENT 'Is this template extractable';
INSERT INTO configuration (category, instance, component, name, value, description) VALUES ('Advanced', 'DEFAULT', 'management-server', 'external.network.stats.interval', '300', 'Interval (in seconds) to report external network statistics.');
+
+CREATE TABLE `cloud`.`mshost_peer` (
+ `id` bigint unsigned NOT NULL auto_increment,
+ `owner_mshost` bigint unsigned NOT NULL,
+ `peer_mshost` bigint unsigned NOT NULL,
+ `peer_runid` bigint NOT NULL,
+ `peer_state` varchar(10) NOT NULL DEFAULT 'Down',
+ `last_update` DATETIME NULL COMMENT 'Last record update time',
+
+ PRIMARY KEY (`id`),
+ CONSTRAINT `fk_mshost_peer__owner_mshost` FOREIGN KEY (`owner_mshost`) REFERENCES `mshost`(`id`) ON DELETE CASCADE,
+ CONSTRAINT `fk_mshost_peer__peer_mshost` FOREIGN KEY (`peer_mshost`) REFERENCES `mshost`(`id`),
+ UNIQUE `i_mshost_peer__owner_peer_runid`(`owner_mshost`, `peer_mshost`, `peer_runid`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+