From b1d5b5a51df920fc9f745c25603935677c56bcb3 Mon Sep 17 00:00:00 2001 From: alena Date: Thu, 21 Jul 2011 14:09:08 -0700 Subject: [PATCH] 2 fixes for Agent Load Balancer: * when a management server dies and notifies other management servers about this, the running management server has to clean up host_transfer records belonging to the dead management server * issue agent load balancing task only when agent load (number of connected agents in the system) exceeds "agent.load.threshold" - 70% by default --- .../manager/ClusteredAgentManagerImpl.java | 10 +++--- .../com/cloud/cluster/ClusterManagerImpl.java | 34 ++++++++++++++++--- .../agentlb/dao/HostTransferMapDaoImpl.java | 1 + .../src/com/cloud/configuration/Config.java | 4 +-- server/src/com/cloud/host/dao/HostDao.java | 2 ++ .../src/com/cloud/host/dao/HostDaoImpl.java | 12 +++++++ setup/db/db/schema-228to229.sql | 1 + 7 files changed, 53 insertions(+), 11 deletions(-) diff --git a/server/src/com/cloud/agent/manager/ClusteredAgentManagerImpl.java b/server/src/com/cloud/agent/manager/ClusteredAgentManagerImpl.java index 4fc2dbb98aa..e03daa85b24 100644 --- a/server/src/com/cloud/agent/manager/ClusteredAgentManagerImpl.java +++ b/server/src/com/cloud/agent/manager/ClusteredAgentManagerImpl.java @@ -566,7 +566,7 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust //cancel all transfer tasks s_transferExecutor.shutdownNow(); - cleanupTransferMap(); + cleanupTransferMap(_nodeId); return super.stop(); } @@ -694,6 +694,8 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust for (ManagementServerHostVO vo : nodeList) { s_logger.info("Marking hosts as disconnected on Management server" + vo.getMsid()); _hostDao.markHostsAsDisconnected(vo.getMsid()); + s_logger.info("Deleting entries from op_host_transfer table for Management server " + vo.getMsid()); + cleanupTransferMap(vo.getMsid()); } } @@ -1071,14 +1073,14 @@ public class ClusteredAgentManagerImpl extends
AgentManagerImpl implements Clust return true; } - protected void cleanupTransferMap() { - List hostsJoingingCluster = _hostTransferDao.listHostsJoiningCluster(_nodeId); + protected void cleanupTransferMap(long msId) { + List hostsJoingingCluster = _hostTransferDao.listHostsJoiningCluster(msId); for (HostTransferMapVO hostJoingingCluster : hostsJoingingCluster) { _hostTransferDao.remove(hostJoingingCluster.getId()); } - List hostsLeavingCluster = _hostTransferDao.listHostsLeavingCluster(_nodeId); + List hostsLeavingCluster = _hostTransferDao.listHostsLeavingCluster(msId); for (HostTransferMapVO hostLeavingCluster : hostsLeavingCluster) { _hostTransferDao.remove(hostLeavingCluster.getId()); } diff --git a/server/src/com/cloud/cluster/ClusterManagerImpl.java b/server/src/com/cloud/cluster/ClusterManagerImpl.java index ada2f5f6fda..491019fba26 100644 --- a/server/src/com/cloud/cluster/ClusterManagerImpl.java +++ b/server/src/com/cloud/cluster/ClusterManagerImpl.java @@ -127,6 +127,9 @@ public class ClusterManagerImpl implements ClusterManager { private String _name; private String _clusterNodeIP = "127.0.0.1"; private boolean _agentLBEnabled = false; + private double _connectedAgentsThreshold = 0.7; + private static boolean _agentLbHappened = false; + public ClusterManagerImpl() { clusterPeers = new HashMap(); @@ -603,6 +606,26 @@ public class ClusterManagerImpl implements ClusterManager { } peerScan(); + + //initiate agent lb task will be scheduled and executed only once, and only when number of agents loaded exceeds _connectedAgentsThreshold + if (_agentLBEnabled && !_agentLbHappened) { + List allManagedRoutingAgents = _hostDao.listManagedRoutingAgents(); + List allAgents = _hostDao.listAllRoutingAgents(); + double allHostsCount = allAgents.size(); + double managedHostsCount = allManagedRoutingAgents.size(); + if (allHostsCount > 0.0) { + double load = managedHostsCount/allHostsCount; + if (load >= _connectedAgentsThreshold) { + s_logger.debug("Scheduling agent 
rebalancing task as the average agent load " + load + " is more than the threshold " + _connectedAgentsThreshold); + _rebalanceService.scheduleRebalanceAgents(); + _agentLbHappened = true; + } else { + s_logger.trace("Not scheduling agent rebalancing task as the averages load " + load + " is less than the threshold " + _connectedAgentsThreshold); + } + } + } + + } catch(CloudRuntimeException e) { s_logger.error("Runtime DB exception ", e.getCause()); @@ -925,10 +948,6 @@ public class ClusterManagerImpl implements ClusterManager { _heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), heartbeatInterval, heartbeatInterval, TimeUnit.MILLISECONDS); _notificationExecutor.submit(getNotificationTask()); - //Initiate agent rebalancing after the host is in UP state - if (_agentLBEnabled) { - _rebalanceService.scheduleRebalanceAgents(); - } } catch (Throwable e) { s_logger.error("Unexpected exception : ", e); @@ -1051,6 +1070,12 @@ public class ClusterManagerImpl implements ClusterManager { _agentLBEnabled = Boolean.valueOf(configDao.getValue(Config.AgentLbEnable.key())); + + String connectedAgentsThreshold = configs.get("agent.load.threshold"); + + if (connectedAgentsThreshold != null) { + _connectedAgentsThreshold = Double.parseDouble(connectedAgentsThreshold); + } this.registerListener(new LockMasterListener(_msId)); @@ -1179,5 +1204,4 @@ public class ClusterManagerImpl implements ClusterManager { public boolean isAgentRebalanceEnabled() { return _agentLBEnabled; } - } diff --git a/server/src/com/cloud/cluster/agentlb/dao/HostTransferMapDaoImpl.java b/server/src/com/cloud/cluster/agentlb/dao/HostTransferMapDaoImpl.java index 0fff8268870..acef54a2b58 100644 --- a/server/src/com/cloud/cluster/agentlb/dao/HostTransferMapDaoImpl.java +++ b/server/src/com/cloud/cluster/agentlb/dao/HostTransferMapDaoImpl.java @@ -51,6 +51,7 @@ public class HostTransferMapDaoImpl extends GenericDaoBase _componentClass; diff --git a/server/src/com/cloud/host/dao/HostDao.java 
b/server/src/com/cloud/host/dao/HostDao.java index aa16fe604ee..30d1592346c 100644 --- a/server/src/com/cloud/host/dao/HostDao.java +++ b/server/src/com/cloud/host/dao/HostDao.java @@ -184,4 +184,6 @@ public interface HostDao extends GenericDao { List listByManagementServer(long msId); List listSecondaryStorageVM(long dcId); + + List listAllRoutingAgents(); } diff --git a/server/src/com/cloud/host/dao/HostDaoImpl.java b/server/src/com/cloud/host/dao/HostDaoImpl.java index c02260d31a1..4284568249a 100644 --- a/server/src/com/cloud/host/dao/HostDaoImpl.java +++ b/server/src/com/cloud/host/dao/HostDaoImpl.java @@ -104,6 +104,7 @@ public class HostDaoImpl extends GenericDaoBase implements HostDao protected final GenericSearchBuilder CountRoutingByDc; protected final SearchBuilder HostTransferSearch; protected final SearchBuilder ClusterManagedSearch; + protected final SearchBuilder RoutingSearch; protected final Attribute _statusAttr; protected final Attribute _msIdAttr; @@ -294,6 +295,10 @@ public class HostDaoImpl extends GenericDaoBase implements HostDao ManagedRoutingServersSearch.and("server", ManagedRoutingServersSearch.entity().getManagementServerId(), SearchCriteria.Op.NNULL); ManagedRoutingServersSearch.and("type", ManagedRoutingServersSearch.entity().getType(), SearchCriteria.Op.EQ); ManagedRoutingServersSearch.done(); + + RoutingSearch = createSearchBuilder(); + RoutingSearch.and("type", RoutingSearch.entity().getType(), SearchCriteria.Op.EQ); + RoutingSearch.done(); _statusAttr = _allAttributes.get("status"); _msIdAttr = _allAttributes.get("managementServerId"); @@ -942,4 +947,11 @@ public class HostDaoImpl extends GenericDaoBase implements HostDao return listBy(sc); } + + @Override + public List listAllRoutingAgents() { + SearchCriteria sc = RoutingSearch.create(); + sc.setParameters("type", Type.Routing); + return listBy(sc); + } } diff --git a/setup/db/db/schema-228to229.sql b/setup/db/db/schema-228to229.sql index 30a021b3262..705cbcc3943 100644 --- 
a/setup/db/db/schema-228to229.sql +++ b/setup/db/db/schema-228to229.sql @@ -13,3 +13,4 @@ ALTER TABLE `cloud`.`op_host_capacity` DROP FOREIGN KEY `fk_op_host_capacity__da ALTER TABLE `cloud`.`op_host_capacity` DROP FOREIGN KEY `fk_op_host_capacity__cluster_id`; INSERT IGNORE INTO configuration VALUES ('Advanced', 'DEFAULT', 'NetworkManager', 'domain.level.virtual.network', 'false', 'True if domain level virtual network is supported, false otherwise'); +INSERT IGNORE INTO configuration VALUES ('Advanced', 'DEFAULT', 'management-server', 'agent.load.threshold', '0.70', 'Percentage (as a value between 0 and 1) of connected agents after which agent load balancing will start happening');