CLOUDSTACK-5928: disable host delta sync when new VM sync is enabled

This commit is contained in:
Kelven Yang 2014-01-22 11:09:45 -08:00
parent 0679af3434
commit 50f311ed94
4 changed files with 75 additions and 52 deletions

View File

@ -337,7 +337,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
// TODO, remove it after transient period is over
static final ConfigKey<Boolean> VmJobEnabled = new ConfigKey<Boolean>("Advanced",
Boolean.class, "vm.job.enabled", "false",
Boolean.class, "vm.job.enabled", "true",
"True to enable new VM sync model. false to use the old way", false);
static final ConfigKey<Long> VmJobCheckInterval = new ConfigKey<Long>("Advanced",
@ -2944,12 +2944,14 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
if (cmd instanceof PingRoutingCommand) {
PingRoutingCommand ping = (PingRoutingCommand)cmd;
if (ping.getNewStates() != null && ping.getNewStates().size() > 0) {
Commands commands = deltaHostSync(agentId, ping.getNewStates());
if (commands.size() > 0) {
try {
_agentMgr.send(agentId, commands, this);
} catch (final AgentUnavailableException e) {
s_logger.warn("Agent is now unavailable", e);
if (!VmJobEnabled.value()) {
Commands commands = deltaHostSync(agentId, ping.getNewStates());
if (commands.size() > 0) {
try {
_agentMgr.send(agentId, commands, this);
} catch (final AgentUnavailableException e) {
s_logger.warn("Agent is now unavailable", e);
}
}
}
}
@ -3973,6 +3975,8 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
s_logger.warn("VM " + vmId + " no longer exists when processing VM state report");
}
} else {
s_logger.info("There is pending job working on the VM. vm id: " + vmId + ", postpone power-change report by resetting power-change counters");
// reset VM power state tracking so that we won't lost signal when VM has
// been translated to
_vmDao.resetVmPowerStateTracking(vmId);
@ -3981,19 +3985,23 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
private void handlePowerOnReportWithNoPendingJobsOnVM(VMInstanceVO vm) {
//
// 1) handle left-over transitional VM states
// 1) handle left-over transitional VM states
// 2) handle out of band VM live migration
// 3) handle out of sync stationary states, marking VM from Stopped to Running with
// alert messages
//
switch (vm.getState()) {
case Starting:
s_logger.info("VM " + vm.getInstanceName() + " is at " + vm.getState() + " and we received a power-on report while there is no pending jobs on it");
try {
stateTransitTo(vm, VirtualMachine.Event.FollowAgentPowerOnReport, vm.getPowerHostId());
} catch (NoTransitionException e) {
s_logger.warn("Unexpected VM state transition exception, race-condition?", e);
}
s_logger.info("VM " + vm.getInstanceName() + " is sync-ed to at Running state according to power-on report from hypervisor");
// we need to alert admin or user about this risky state transition
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(),
VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName()
@ -4008,10 +4016,13 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
} catch (NoTransitionException e) {
s_logger.warn("Unexpected VM state transition exception, race-condition?", e);
}
break;
case Stopping:
case Stopped:
s_logger.info("VM " + vm.getInstanceName() + " is at " + vm.getState() + " and we received a power-on report while there is no pending jobs on it");
try {
stateTransitTo(vm, VirtualMachine.Event.FollowAgentPowerOnReport, vm.getPowerHostId());
} catch (NoTransitionException e) {
@ -4020,6 +4031,8 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(),
VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") state is sync-ed (" + vm.getState()
+ " -> Running) from out-of-context transition. VM network environment may need to be reset");
s_logger.info("VM " + vm.getInstanceName() + " is sync-ed to at Running state according to power-on report from hypervisor");
break;
case Destroyed:
@ -4029,11 +4042,13 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
break;
case Migrating:
s_logger.info("VM " + vm.getInstanceName() + " is at " + vm.getState() + " and we received a power-on report while there is no pending jobs on it");
try {
stateTransitTo(vm, VirtualMachine.Event.FollowAgentPowerOnReport, vm.getPowerHostId());
} catch (NoTransitionException e) {
s_logger.warn("Unexpected VM state transition exception, race-condition?", e);
}
s_logger.info("VM " + vm.getInstanceName() + " is sync-ed to at Running state according to power-on report from hypervisor");
break;
case Error:
@ -4046,7 +4061,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
private void handlePowerOffReportWithNoPendingJobsOnVM(VMInstanceVO vm) {
// 1) handle left-over transitional VM states
// 1) handle left-over transitional VM states
// 2) handle out of sync stationary states, schedule force-stop to release resources
//
switch (vm.getState()) {
@ -4055,14 +4070,19 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
case Running:
case Stopped:
case Migrating:
s_logger.info("VM " + vm.getInstanceName() + " is at " + vm.getState() + " and we received a power-off report while there is no pending jobs on it");
try {
stateTransitTo(vm, VirtualMachine.Event.FollowAgentPowerOffReport, vm.getPowerHostId());
} catch (NoTransitionException e) {
s_logger.warn("Unexpected VM state transition exception, race-condition?", e);
}
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(),
VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") state is sync-ed (" + vm.getState()
+ " -> Stopped) from out-of-context transition.");
s_logger.info("VM " + vm.getInstanceName() + " is sync-ed to at Stopped state according to power-on report from hypervisor");
// TODO: we need to forcely release all resource allocation
break;

View File

@ -132,8 +132,8 @@ import com.cloud.vm.DomainRouterVO;
public class VmwareManagerImpl extends ManagerBase implements VmwareManager, VmwareStorageMount, Listener, VmwareDatacenterService {
private static final Logger s_logger = Logger.getLogger(VmwareManagerImpl.class);
private static final int STARTUP_DELAY = 60000; // 60 seconds
private static final long DEFAULT_HOST_SCAN_INTERVAL = 600000; // every 10 minutes
private static final int STARTUP_DELAY = 60000; // 60 seconds
private static final long DEFAULT_HOST_SCAN_INTERVAL = 600000; // every 10 minutes
private long _hostScanInterval = DEFAULT_HOST_SCAN_INTERVAL;
int _timeout;
@ -173,7 +173,7 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
String _managemetPortGroupName;
String _defaultSystemVmNicAdapterType = VirtualEthernetCardType.E1000.toString();
String _recycleHungWorker = "false";
long _hungWorkerTimeout = 7200000; // 2 hour
long _hungWorkerTimeout = 7200000; // 2 hour
int _additionalPortRangeStart;
int _additionalPortRangeSize;
int _routerExtraPublicNics = 2;
@ -296,10 +296,10 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
if(_recycleHungWorker == null || _recycleHungWorker.isEmpty()) {
_recycleHungWorker = "false";
}
value = _configDao.getValue(Config.VmwareHungWorkerTimeout.key());
if(value != null)
_hungWorkerTimeout = Long.parseLong(value) * 1000;
_hungWorkerTimeout = Long.parseLong(value) * 1000;
_rootDiskController = _configDao.getValue(Config.VmwareRootDiskControllerType.key());
if(_rootDiskController == null || _rootDiskController.isEmpty()) {
@ -459,7 +459,7 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
hostSpec.setUserName(userName);
hostSpec.setPassword(password);
hostSpec.setHostName(host);
hostSpec.setForce(true); // forcely take over the host
hostSpec.setForce(true); // forcely take over the host
ManagedObjectReference morTask = serviceContext.getService().addHostTask(morCluster, hostSpec, true, null, null);
boolean taskResult = vclient.waitForTask(morTask);
@ -547,49 +547,49 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
public void gcLeftOverVMs(VmwareContext context) {
VmwareCleanupMaid.gcLeftOverVMs(context);
}
@Override
public boolean needRecycle(String workerTag) {
if(s_logger.isInfoEnabled())
s_logger.info("Check to see if a worker VM with tag " + workerTag + " needs to be recycled");
if(workerTag == null || workerTag.isEmpty()) {
s_logger.error("Invalid worker VM tag " + workerTag);
return false;
}
String tokens[] = workerTag.split("-");
if(tokens.length != 3) {
s_logger.error("Invalid worker VM tag " + workerTag);
return false;
}
long startTick = Long.parseLong(tokens[0]);
long msid = Long.parseLong(tokens[1]);
long runid = Long.parseLong(tokens[2]);
if (s_logger.isInfoEnabled())
s_logger.info("Check to see if a worker VM with tag " + workerTag + " needs to be recycled");
if (workerTag == null || workerTag.isEmpty()) {
s_logger.error("Invalid worker VM tag " + workerTag);
return false;
}
String tokens[] = workerTag.split("-");
if (tokens.length != 3) {
s_logger.error("Invalid worker VM tag " + workerTag);
return false;
}
long startTick = Long.parseLong(tokens[0]);
long msid = Long.parseLong(tokens[1]);
long runid = Long.parseLong(tokens[2]);
if(_mshostPeerDao.countStateSeenInPeers(msid, runid, ManagementServerHost.State.Down) > 0) {
if(s_logger.isInfoEnabled())
s_logger.info("Worker VM's owner management server node has been detected down from peer nodes, recycle it");
return true;
if (s_logger.isInfoEnabled())
s_logger.info("Worker VM's owner management server node has been detected down from peer nodes, recycle it");
return true;
}
if(msid == _clusterMgr.getManagementNodeId() && runid != _clusterMgr.getCurrentRunId()) {
if(s_logger.isInfoEnabled())
s_logger.info("Worker VM's owner management server has changed runid, recycle it");
return true;
if (s_logger.isInfoEnabled())
s_logger.info("Worker VM's owner management server has changed runid, recycle it");
return true;
}
// disable time-out check until we have found out a VMware API that can check if
// there are pending tasks on the subject VM
/*
if(System.currentTimeMillis() - startTick > _hungWorkerTimeout) {
if(s_logger.isInfoEnabled())
s_logger.info("Worker VM expired, seconds elapsed: " + (System.currentTimeMillis() - startTick) / 1000);
return true;
}
*/
return false;
/*
if(System.currentTimeMillis() - startTick > _hungWorkerTimeout) {
if(s_logger.isInfoEnabled())
s_logger.info("Worker VM expired, seconds elapsed: " + (System.currentTimeMillis() - startTick) / 1000);
return true;
}
*/
return false;
}
@Override
@ -619,6 +619,9 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
", destination: " + destIso.getAbsolutePath());
try {
FileUtil.copyfile(srcIso, destIso);
s_logger.info("System VM patch ISO file is copied to secondary storage. source ISO: " + srcIso.getAbsolutePath() +
", destination: " + destIso.getAbsolutePath());
} catch(IOException e) {
s_logger.error("Unexpected exception ", e);

View File

@ -333,7 +333,7 @@ public class VolumeApiServiceImpl extends ManagerBase implements VolumeApiServic
// TODO
static final ConfigKey<Boolean> VmJobEnabled = new ConfigKey<Boolean>("Advanced",
Boolean.class, "vm.job.enabled", "false",
Boolean.class, "vm.job.enabled", "true",
"True to enable new VM sync model. false to use the old way", false);
static final ConfigKey<Long> VmJobCheckInterval = new ConfigKey<Long>("Advanced",
Long.class, "vm.job.check.interval", "3000",

View File

@ -136,7 +136,7 @@ public class VMSnapshotManagerImpl extends ManagerBase implements VMSnapshotMana
// TODO
static final ConfigKey<Boolean> VmJobEnabled = new ConfigKey<Boolean>("Advanced",
Boolean.class, "vm.job.enabled", "false",
Boolean.class, "vm.job.enabled", "true",
"True to enable new VM sync model. false to use the old way", false);
static final ConfigKey<Long> VmJobCheckInterval = new ConfigKey<Long>("Advanced",
Long.class, "vm.job.check.interval", "3000",