From dcdd87b30fddf6b708b2d8ff41ef8ae5a2485828 Mon Sep 17 00:00:00 2001 From: Kelven Yang Date: Wed, 4 Jan 2012 18:06:15 -0800 Subject: [PATCH] bug 12709: add management server active fencing --- .../cloud/cluster/ActiveFencingException.java | 31 +++++ .../com/cloud/cluster/ClusterManagerImpl.java | 51 +++++++- .../cluster/ManagementServerHostPeerVO.java | 121 ++++++++++++++++++ .../dao/ManagementServerHostPeerDao.java | 29 +++++ .../dao/ManagementServerHostPeerDaoImpl.java | 108 ++++++++++++++++ .../DefaultComponentLibrary.java | 2 + setup/db/create-schema.sql | 14 ++ setup/db/db/schema-2213to2214.sql | 15 +++ 8 files changed, 369 insertions(+), 2 deletions(-) create mode 100644 server/src/com/cloud/cluster/ActiveFencingException.java create mode 100644 server/src/com/cloud/cluster/ManagementServerHostPeerVO.java create mode 100644 server/src/com/cloud/cluster/dao/ManagementServerHostPeerDao.java create mode 100644 server/src/com/cloud/cluster/dao/ManagementServerHostPeerDaoImpl.java diff --git a/server/src/com/cloud/cluster/ActiveFencingException.java b/server/src/com/cloud/cluster/ActiveFencingException.java new file mode 100644 index 00000000000..65149d5e353 --- /dev/null +++ b/server/src/com/cloud/cluster/ActiveFencingException.java @@ -0,0 +1,31 @@ +/** + * Copyright (C) 2010 Cloud.com, Inc. All rights reserved. + * + * This software is licensed under the GNU General Public License v3 or later. + * + * It is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +package com.cloud.cluster; + +public class ActiveFencingException extends Exception { + private static final long serialVersionUID = -3975376101728211726L; + + public ActiveFencingException(String message) { + super(message); + } + + public ActiveFencingException(String message, Throwable th) { + super(message, th); + } +} diff --git a/server/src/com/cloud/cluster/ClusterManagerImpl.java b/server/src/com/cloud/cluster/ClusterManagerImpl.java index 02aa52a078a..bb5bac3d060 100755 --- a/server/src/com/cloud/cluster/ClusterManagerImpl.java +++ b/server/src/com/cloud/cluster/ClusterManagerImpl.java @@ -51,8 +51,10 @@ import com.cloud.agent.api.ChangeAgentCommand; import com.cloud.agent.api.Command; import com.cloud.agent.api.PropagateResourceEventCommand; import com.cloud.agent.manager.Commands; +import com.cloud.cluster.ManagementServerHost.State; import com.cloud.cluster.agentlb.dao.HostTransferMapDao; import com.cloud.cluster.dao.ManagementServerHostDao; +import com.cloud.cluster.dao.ManagementServerHostPeerDao; import com.cloud.configuration.Config; import com.cloud.configuration.dao.ConfigurationDao; import com.cloud.exception.AgentUnavailableException; @@ -119,6 +121,7 @@ public class ClusterManagerImpl implements ClusterManager { private ClusterServiceAdapter _currentServiceAdapter; private ManagementServerHostDao _mshostDao; + private ManagementServerHostPeerDao _mshostPeerDao; private HostDao _hostDao; private HostTransferMapDao _hostTransferDao; @@ -683,6 +686,8 @@ public class ClusterManagerImpl implements ClusterManager { } invalidHeartbeatConnection(); + } catch(ActiveFencingException e) { + queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated)); } catch (Throwable e) { if(isRootCauseConnectionRelated(e.getCause())) { s_logger.error("DB communication problem detected"); @@ -804,6 +809,34 @@ public class ClusterManagerImpl implements ClusterManager { this._notificationMsgs.add(msg); this._notificationMsgs.notifyAll(); } + + switch(msg.getMessageType()) { + case nodeAdded: + { + List l = msg.getNodes(); + if(l != null && l.size() > 0) { + for(ManagementServerHostVO mshost: l) { + _mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Up); + } + } + } + break; + + case nodeRemoved: + { + List l = msg.getNodes(); + if(l != null && l.size() > 0) { + for(ManagementServerHostVO mshost: l) { + _mshostPeerDao.updatePeerInfo(_mshostId, mshost.getId(), mshost.getRunid(), ManagementServerHost.State.Down); + } + } + } + break; + + default : + break; + + } } private ClusterManagerMessage getNextNotificationMessage() { @@ -848,7 +881,7 @@ public class ClusterManagerImpl implements ClusterManager { } } - private void peerScan() { + private void peerScan() throws ActiveFencingException { Date cutTime = DateUtil.currentGMTTime(); List currentList = _mshostDao.getActiveList(new Date(cutTime.getTime() - _heartbeatThreshold)); @@ -857,6 +890,13 @@ public class ClusterManagerImpl implements ClusterManager { List invalidatedNodeList = new ArrayList(); if(_mshostId != null) { + + if(_mshostPeerDao.countStateSeenInPeers(_mshostId, _runId, ManagementServerHost.State.Down) > 0) { + String msg = "We have detected that at least one management server peer reports that this management server is down, perform active fencing to avoid split-brain situation"; + s_logger.error(msg); + throw new ActiveFencingException(msg); + } + // only if we have already attached to cluster, will we start to check leaving nodes for(Map.Entry entry : _activePeers.entrySet()) { @@ -1007,6 +1047,8 @@ public class ClusterManagerImpl implements ClusterManager { if (s_logger.isInfoEnabled()) { s_logger.info("Management server (host id : " + _mshostId + ") is being started at " + _clusterNodeIP + ":" + _currentServiceAdapter.getServicePort()); } + + _mshostPeerDao.clearPeerInfo(_mshostId); // use seperate thread for heartbeat updates _heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), _heartbeatInterval, _heartbeatInterval, TimeUnit.MILLISECONDS); @@ -1067,7 +1109,12 @@ public class ClusterManagerImpl implements ClusterManager { if (_mshostDao == null) { throw new ConfigurationException("Unable to get " + ManagementServerHostDao.class.getName()); } - + + _mshostPeerDao = locator.getDao(ManagementServerHostPeerDao.class); + if (_mshostPeerDao == null) { + throw new ConfigurationException("Unable to get " + ManagementServerHostPeerDao.class.getName()); + } + _hostDao = locator.getDao(HostDao.class); if (_hostDao == null) { throw new ConfigurationException("Unable to get " + HostDao.class.getName()); diff --git a/server/src/com/cloud/cluster/ManagementServerHostPeerVO.java b/server/src/com/cloud/cluster/ManagementServerHostPeerVO.java new file mode 100644 index 00000000000..eafbe939156 --- /dev/null +++ b/server/src/com/cloud/cluster/ManagementServerHostPeerVO.java @@ -0,0 +1,121 @@ +/** + * Copyright (C) 2010 Cloud.com, Inc. All rights reserved. + * + * This software is licensed under the GNU General Public License v3 or later. + * + * It is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +package com.cloud.cluster; + +import java.util.Date; + +import javax.persistence.Column; +import javax.persistence.Entity; +import javax.persistence.EnumType; +import javax.persistence.Enumerated; +import javax.persistence.GeneratedValue; +import javax.persistence.GenerationType; +import javax.persistence.Id; +import javax.persistence.Table; +import javax.persistence.Temporal; +import javax.persistence.TemporalType; + +import com.cloud.utils.DateUtil; + +@Entity +@Table(name="mshost_peer") +public class ManagementServerHostPeerVO { + + @Id + @GeneratedValue(strategy=GenerationType.IDENTITY) + @Column(name="id") + private long id; + + @Column(name="owner_mshost", updatable=true, nullable=false) + private long ownerMshost; + + @Column(name="peer_mshost", updatable=true, nullable=false) + private long peerMshost; + + @Column(name="peer_runid", updatable=true, nullable=false) + private long peerRunid; + + @Column(name="peer_state", updatable = true, nullable=false) + @Enumerated(value=EnumType.STRING) + private ManagementServerHost.State peerState; + + @Temporal(TemporalType.TIMESTAMP) + @Column(name="last_update", updatable=true, nullable=true) + private Date lastUpdateTime; + + public ManagementServerHostPeerVO() { + } + + public ManagementServerHostPeerVO(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState) { + this.ownerMshost = ownerMshost; + this.peerMshost = peerMshost; + this.peerRunid = peerRunid; + this.peerState = peerState; + + this.lastUpdateTime = DateUtil.currentGMTTime(); + } + + public long getId() { + return id; + } + + public void setId(long id) { + this.id = id; + } + + public long getOwnerMshost() { + return ownerMshost; + } + + public void setOwnerMshost(long ownerMshost) { + this.ownerMshost = ownerMshost; + } + + public long getPeerMshost() { + return peerMshost; + } + + public void setPeerMshost(long peerMshost) { + this.peerMshost = peerMshost; + } + + public long getPeerRunid() { + return peerRunid; + } + + public void setPeerRunid(long peerRunid) { + this.peerRunid = peerRunid; + } + + public ManagementServerHost.State getPeerState() { + return peerState; + } + + public void setPeerState(ManagementServerHost.State peerState) { + this.peerState = peerState; + } + + public Date getLastUpdateTime() { + return lastUpdateTime; + } + + public void setLastUpdateTime(Date lastUpdateTime) { + this.lastUpdateTime = lastUpdateTime; + } +} diff --git a/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDao.java b/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDao.java new file mode 100644 index 00000000000..599996e2ba5 --- /dev/null +++ b/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDao.java @@ -0,0 +1,29 @@ +/** + * Copyright (C) 2010 Cloud.com, Inc. All rights reserved. + * + * This software is licensed under the GNU General Public License v3 or later. + * + * It is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +package com.cloud.cluster.dao; + +import com.cloud.cluster.ManagementServerHost; +import com.cloud.cluster.ManagementServerHostPeerVO; +import com.cloud.utils.db.GenericDao; + +public interface ManagementServerHostPeerDao extends GenericDao { + void clearPeerInfo(long ownerMshost); + void updatePeerInfo(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState); + int countStateSeenInPeers(long mshost, long runid, ManagementServerHost.State state); +} diff --git a/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDaoImpl.java b/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDaoImpl.java new file mode 100644 index 00000000000..658f98efa44 --- /dev/null +++ b/server/src/com/cloud/cluster/dao/ManagementServerHostPeerDaoImpl.java @@ -0,0 +1,108 @@ +/** + * Copyright (C) 2010 Cloud.com, Inc. All rights reserved. + * + * This software is licensed under the GNU General Public License v3 or later. + * + * It is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +package com.cloud.cluster.dao; + +import java.util.List; + +import javax.ejb.Local; + +import org.apache.log4j.Logger; + +import com.cloud.cluster.ManagementServerHost; +import com.cloud.cluster.ManagementServerHostPeerVO; +import com.cloud.utils.db.DB; +import com.cloud.utils.db.GenericDaoBase; +import com.cloud.utils.db.SearchBuilder; +import com.cloud.utils.db.SearchCriteria; +import com.cloud.utils.db.Transaction; + +@Local(value={ManagementServerHostPeerDao.class}) +public class ManagementServerHostPeerDaoImpl extends GenericDaoBase implements ManagementServerHostPeerDao { + private static final Logger s_logger = Logger.getLogger(ManagementServerHostPeerDaoImpl.class); + + private final SearchBuilder ClearPeerSearch; + private final SearchBuilder FindForUpdateSearch; + private final SearchBuilder CountSearch; + + public ManagementServerHostPeerDaoImpl() { + ClearPeerSearch = createSearchBuilder(); + ClearPeerSearch.and("ownerMshost", ClearPeerSearch.entity().getOwnerMshost(), SearchCriteria.Op.EQ); + ClearPeerSearch.done(); + + FindForUpdateSearch = createSearchBuilder(); + FindForUpdateSearch.and("ownerMshost", FindForUpdateSearch.entity().getOwnerMshost(), SearchCriteria.Op.EQ); + FindForUpdateSearch.and("peerMshost", FindForUpdateSearch.entity().getPeerMshost(), SearchCriteria.Op.EQ); + FindForUpdateSearch.and("peerRunid", FindForUpdateSearch.entity().getPeerRunid(), SearchCriteria.Op.EQ); + FindForUpdateSearch.done(); + + CountSearch = createSearchBuilder(); + CountSearch.and("peerMshost", CountSearch.entity().getPeerMshost(), SearchCriteria.Op.EQ); + CountSearch.and("peerRunid", CountSearch.entity().getPeerRunid(), SearchCriteria.Op.EQ); + CountSearch.and("peerState", CountSearch.entity().getPeerState(), SearchCriteria.Op.EQ); + CountSearch.done(); + } + + @Override + @DB + public void clearPeerInfo(long ownerMshost) { + SearchCriteria sc = ClearPeerSearch.create(); + sc.setParameters("ownerMshost", ownerMshost); + + expunge(sc); + } + + @Override + @DB + public void updatePeerInfo(long ownerMshost, long peerMshost, long peerRunid, ManagementServerHost.State peerState) { + Transaction txn = Transaction.currentTxn(); + try { + txn.start(); + + SearchCriteria sc = FindForUpdateSearch.create(); + sc.setParameters("ownerMshost", ownerMshost); + sc.setParameters("peerMshost", peerMshost); + sc.setParameters("peerRunid", peerRunid); + List l = listBy(sc); + if(l.size() == 1) { + ManagementServerHostPeerVO peer = l.get(0); + peer.setPeerState(peerState); + update(peer.getId(), peer); + } else { + ManagementServerHostPeerVO peer = new ManagementServerHostPeerVO(ownerMshost, peerMshost, peerRunid, peerState); + persist(peer); + } + txn.commit(); + } catch(Exception e) { + s_logger.warn("Unexpected exception, ", e); + txn.rollback(); + } + } + + @Override + @DB + public int countStateSeenInPeers(long mshost, long runid, ManagementServerHost.State state) { + SearchCriteria sc = CountSearch.create(); + sc.setParameters("peerMshost", mshost); + sc.setParameters("peerRunid", runid); + sc.setParameters("peerState", state); + + List l = listBy(sc); + return l.size(); + } +} diff --git a/server/src/com/cloud/configuration/DefaultComponentLibrary.java b/server/src/com/cloud/configuration/DefaultComponentLibrary.java index e04946b7c4b..d1b1e4b1c30 100755 --- a/server/src/com/cloud/configuration/DefaultComponentLibrary.java +++ b/server/src/com/cloud/configuration/DefaultComponentLibrary.java @@ -39,6 +39,7 @@ import com.cloud.cluster.ClusterFenceManagerImpl; import com.cloud.cluster.ClusterManagerImpl; import com.cloud.cluster.agentlb.dao.HostTransferMapDaoImpl; import com.cloud.cluster.dao.ManagementServerHostDaoImpl; +import com.cloud.cluster.dao.ManagementServerHostPeerDaoImpl; import com.cloud.cluster.dao.StackMaidDaoImpl; import com.cloud.configuration.dao.ConfigurationDaoImpl; import com.cloud.configuration.dao.ResourceCountDaoImpl; @@ -262,6 +263,7 @@ public class DefaultComponentLibrary extends ComponentLibraryBase implements Com addDao("ConsoleProxyDao", ConsoleProxyDaoImpl.class); addDao("SecondaryStorageVmDao", SecondaryStorageVmDaoImpl.class); addDao("ManagementServerHostDao", ManagementServerHostDaoImpl.class); + addDao("ManagementServerHostPeerDao", ManagementServerHostPeerDaoImpl.class); addDao("AgentUpgradeDao", AgentUpgradeDaoImpl.class); addDao("SnapshotDao", SnapshotDaoImpl.class); addDao("AsyncJobDao", AsyncJobDaoImpl.class); diff --git a/setup/db/create-schema.sql b/setup/db/create-schema.sql index 6d332071a3e..951d415854e 100755 --- a/setup/db/create-schema.sql +++ b/setup/db/create-schema.sql @@ -789,6 +789,20 @@ CREATE TABLE `cloud`.`mshost` ( INDEX `i_mshost__last_update`(`last_update`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8; +CREATE TABLE `cloud`.`mshost_peer` ( + `id` bigint unsigned NOT NULL auto_increment, + `owner_mshost` bigint unsigned NOT NULL, + `peer_mshost` bigint unsigned NOT NULL, + `peer_runid` bigint NOT NULL, + `peer_state` varchar(10) NOT NULL DEFAULT 'Down', + `last_update` DATETIME NULL COMMENT 'Last record update time', + + PRIMARY KEY (`id`), + CONSTRAINT `fk_mshost_peer__owner_mshost` FOREIGN KEY (`owner_mshost`) REFERENCES `mshost`(`id`) ON DELETE CASCADE, + CONSTRAINT `fk_mshost_peer__peer_mshost` FOREIGN KEY (`peer_mshost`) REFERENCES `mshost`(`id`), + UNIQUE `i_mshost_peer__owner_peer_runid`(`owner_mshost`, `peer_mshost`, `peer_runid`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; + CREATE TABLE `cloud`.`host_tags` ( `id` bigint unsigned NOT NULL auto_increment, `host_id` bigint unsigned NOT NULL COMMENT 'host id', diff --git a/setup/db/db/schema-2213to2214.sql b/setup/db/db/schema-2213to2214.sql index 4673197789f..1577986f9e3 100644 --- a/setup/db/db/schema-2213to2214.sql +++ b/setup/db/db/schema-2213to2214.sql @@ -4,3 +4,18 @@ ALTER TABLE `cloud`.`vm_template` MODIFY `extractable` int(1) unsigned NOT NULL default 0 COMMENT 'Is this template extractable'; INSERT INTO configuration (category, instance, component, name, value, description) VALUES ('Advanced', 'DEFAULT', 'management-server', 'external.network.stats.interval', '300', 'Interval (in seconds) to report external network statistics.'); + +CREATE TABLE `cloud`.`mshost_peer` ( + `id` bigint unsigned NOT NULL auto_increment, + `owner_mshost` bigint unsigned NOT NULL, + `peer_mshost` bigint unsigned NOT NULL, + `peer_runid` bigint NOT NULL, + `peer_state` varchar(10) NOT NULL DEFAULT 'Down', + `last_update` DATETIME NULL COMMENT 'Last record update time', + + PRIMARY KEY (`id`), + CONSTRAINT `fk_mshost_peer__owner_mshost` FOREIGN KEY (`owner_mshost`) REFERENCES `mshost`(`id`) ON DELETE CASCADE, + CONSTRAINT `fk_mshost_peer__peer_mshost` FOREIGN KEY (`peer_mshost`) REFERENCES `mshost`(`id`), + UNIQUE `i_mshost_peer__owner_peer_runid`(`owner_mshost`, `peer_mshost`, `peer_runid`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; +