bug 12709: incremental fix - profiling management server clustering heartbeat activities

2012-01-03 17:51:25 -08:00 · 2012-01-03 17:51:25 -08:00 · 3750c7055b
parent 277fdc229b
commit 3750c7055b
2 changed files with 87 additions and 43 deletions
--- a/server/src/com/cloud/cluster/ClusterManagerImpl.java
+++ b/server/src/com/cloud/cluster/ClusterManagerImpl.java
@ -593,43 +593,66 @@ public class ClusterManagerImpl implements ClusterManager {
            public void run() {
                Transaction txn = Transaction.open("ClusterHeartBeat");
                try {
-                    txn.transitToUserManagedConnection(getHeartbeatConnection());
-                    if(s_logger.isTraceEnabled()) {
-                        s_logger.trace("Cluster manager heartbeat update, id:" + _mshostId);
-                    }
-
-                    _mshostDao.update(_mshostId, getCurrentRunId(), DateUtil.currentGMTTime());
-
-                    if (s_logger.isTraceEnabled()) {
-                        s_logger.trace("Cluster manager peer-scan, id:" + _mshostId);
-                    }
-
-                    if (!_peerScanInited) {
-                        _peerScanInited = true;
-                        initPeerScan();
-                    }
-
-                    peerScan();
+                    Profiler profiler = new Profiler();
+                    Profiler profilerHeartbeatUpdate = new Profiler();
+                    Profiler profilerPeerScan = new Profiler();
+                    Profiler profilerAgentLB = new Profiler();
                    
-                    //initiate agent lb task will be scheduled and executed only once, and only when number of agents loaded exceeds _connectedAgentsThreshold
-                    if (_agentLBEnabled && !_agentLbHappened) {
-                        List<HostVO> allManagedRoutingAgents = _hostDao.listManagedRoutingAgents();
-                        List<HostVO> allAgents = _hostDao.listAllRoutingAgents();
-                        double allHostsCount = allAgents.size();
-                        double managedHostsCount = allManagedRoutingAgents.size();
-                        if (allHostsCount > 0.0) {
-                            double load = managedHostsCount/allHostsCount;
-                            if (load >= _connectedAgentsThreshold) {
-                                s_logger.debug("Scheduling agent rebalancing task as the average agent load " + load + " is more than the threshold " + _connectedAgentsThreshold);
-                                _rebalanceService.scheduleRebalanceAgents();
-                                _agentLbHappened = true;
-                            } else {
-                                s_logger.trace("Not scheduling agent rebalancing task as the averages load " + load + " is less than the threshold " + _connectedAgentsThreshold);
-                            }
-                        } 
+                    try {
+                        profiler.start();
+                        
+                        profilerHeartbeatUpdate.start();
+                        txn.transitToUserManagedConnection(getHeartbeatConnection());
+                        if(s_logger.isTraceEnabled()) {
+                            s_logger.trace("Cluster manager heartbeat update, id:" + _mshostId);
+                        }
+    
+                        _mshostDao.update(_mshostId, getCurrentRunId(), DateUtil.currentGMTTime());
+                        profilerHeartbeatUpdate.stop();
+    
+                        profilerPeerScan.start();
+                        if (s_logger.isTraceEnabled()) {
+                            s_logger.trace("Cluster manager peer-scan, id:" + _mshostId);
+                        }
+    
+                        if (!_peerScanInited) {
+                            _peerScanInited = true;
+                            initPeerScan();
+                        }
+    
+                        peerScan();
+                        profilerPeerScan.stop();
+                        
+                        profilerAgentLB.start();
+                        //initiate agent lb task will be scheduled and executed only once, and only when number of agents loaded exceeds _connectedAgentsThreshold
+                        if (_agentLBEnabled && !_agentLbHappened) {
+                            List<HostVO> allManagedRoutingAgents = _hostDao.listManagedRoutingAgents();
+                            List<HostVO> allAgents = _hostDao.listAllRoutingAgents();
+                            double allHostsCount = allAgents.size();
+                            double managedHostsCount = allManagedRoutingAgents.size();
+                            if (allHostsCount > 0.0) {
+                                double load = managedHostsCount/allHostsCount;
+                                if (load >= _connectedAgentsThreshold) {
+                                    s_logger.debug("Scheduling agent rebalancing task as the average agent load " + load + " is more than the threshold " + _connectedAgentsThreshold);
+                                    _rebalanceService.scheduleRebalanceAgents();
+                                    _agentLbHappened = true;
+                                } else {
+                                    s_logger.trace("Not scheduling agent rebalancing task as the averages load " + load + " is less than the threshold " + _connectedAgentsThreshold);
+                                }
+                            } 
+                        }
+                        profilerAgentLB.stop();
+                    } finally {
+                        profiler.stop();
+                        
+                        if(profiler.getDuration() >= _heartbeatInterval) {
+                            s_logger.warn("Management server heartbeat takes too long to finish. profiler: " + profiler.toString() + 
+                                ", profilerHeartbeatUpdate: " + profilerHeartbeatUpdate.toString() +
+                                ", profilerPeerScan: " + profilerPeerScan.toString() +
+                                ", profilerAgentLB: " + profilerAgentLB.toString());
+                        }
                    }
                    
-                    
                } catch(CloudRuntimeException e) {
                    s_logger.error("Runtime DB exception ", e.getCause());

--- a/utils/src/com/cloud/utils/Profiler.java
+++ b/utils/src/com/cloud/utils/Profiler.java
@ -19,25 +19,46 @@
 package com.cloud.utils;

 public class Profiler {
-	private long startTickInMs;
-	private long stopTickInMs;
+	private Long startTickInMs;
+	private Long stopTickInMs;
 	
 	public Profiler() {
-		startTickInMs = 0;
-		stopTickInMs = 0;
+		startTickInMs = null;
+		stopTickInMs = null;
 	}
 	
 	public long start() {
 		startTickInMs = System.currentTimeMillis();
-		return startTickInMs;
+		return startTickInMs.longValue();
 	}
 	
 	public long stop() {
 		stopTickInMs = System.currentTimeMillis();
-		return stopTickInMs;
+		return stopTickInMs.longValue();
 	}
 	
-	public long getDuration() {
-		return stopTickInMs - startTickInMs;
-	}
+	public long getDuration() {
+	    if(startTickInMs != null &&  stopTickInMs != null)
+	        return stopTickInMs.longValue() - startTickInMs.longValue();
+	    
+	    return -1;
+	}
+	
+	public boolean isStarted() {
+	    return startTickInMs != null;
+	}
+	
+	public boolean isStopped() {
+	    return stopTickInMs != null;
+	}
+	
+	public String toString() {
+	    if(startTickInMs == null)
+	        return "Not Started";
+	    
+	    if(stopTickInMs == null)
+	        return "Started but not stopped";
+	    
+	    return "Done. Duration: " + getDuration() + "ms";
+	}
 }