mirror of https://github.com/apache/cloudstack.git
Bug 11404 - VM was in Running state but had null for a pod_id, which basically didn't allow creation of subsequent VMs
Reviewed-by: Alex. Changes: - When the management server starts, it goes through all the pending work items from the op_it_work table and schedules HA work for each. It used to mark each item as done; instead, we should keep the item as pending and let it be marked as Done after the HA work completes. - Changes in VirtualMachineMgr::advanceStop(): a) if we find a VM with a null hostId, we stop the VM only if it is a forced stop. b) if the VM state transition to Stopping fails, then for the states Starting and Migrating we try to find the pending work item and then clean up the VM; in case the state is Stopping we can clean up directly. c) We proceed with releasing all resources only if the state transitioned to 'Stopping'. - Changes in HA: a) Depend on VirtualMachineMgr::advanceStop() to do the VM cleanup in case the host is not found. - When the VM state between the management server and the agent syncs from Starting -> Running, mark any pending work item as Done.
This commit is contained in:
parent
180b8dc8f3
commit
13cfa9660d
|
|
@ -2058,6 +2058,9 @@ public class AgentManagerImpl implements AgentManager, HandlerFactory, Manager {
|
|||
boolean doCidrCheck = true;
|
||||
|
||||
ClusterVO clusterVO = _clusterDao.findById(clusterId);
|
||||
if(clusterVO == null){
|
||||
throw new IllegalArgumentException("Cluster with id " + clusterId + " does not exist, cannot add host with this clusterId");
|
||||
}
|
||||
if (clusterVO.getHypervisorType() != scc.getHypervisorType()) {
|
||||
throw new IllegalArgumentException("Can't add host whose hypervisor type is: " + scc.getHypervisorType() + " into cluster: " + clusterId + " whose hypervisor type is: "
|
||||
+ clusterVO.getHypervisorType());
|
||||
|
|
|
|||
|
|
@ -68,10 +68,8 @@ import com.cloud.utils.component.ComponentLocator;
|
|||
import com.cloud.utils.component.Inject;
|
||||
import com.cloud.utils.concurrency.NamedThreadFactory;
|
||||
import com.cloud.utils.exception.CloudRuntimeException;
|
||||
import com.cloud.utils.fsm.NoTransitionException;
|
||||
import com.cloud.vm.VMInstanceVO;
|
||||
import com.cloud.vm.VirtualMachine;
|
||||
import com.cloud.vm.VirtualMachine.Event;
|
||||
import com.cloud.vm.VirtualMachine.State;
|
||||
import com.cloud.vm.VirtualMachineManager;
|
||||
import com.cloud.vm.VirtualMachineProfile;
|
||||
|
|
@ -269,9 +267,17 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu
|
|||
if (hostId == null) {
|
||||
try {
|
||||
s_logger.debug("Found a vm that is scheduled to be restarted but has no host id: " + vm);
|
||||
_itMgr.stateTransitTo(vm, Event.OperationFailed, null);
|
||||
} catch (NoTransitionException e) {
|
||||
}
|
||||
_itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount());
|
||||
} catch (ResourceUnavailableException e) {
|
||||
assert false : "How do we hit this when force is true?";
|
||||
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
|
||||
} catch (OperationTimedoutException e) {
|
||||
assert false : "How do we hit this when force is true?";
|
||||
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
|
||||
} catch (ConcurrentOperationException e) {
|
||||
assert false : "How do we hit this when force is true?";
|
||||
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -300,12 +306,15 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu
|
|||
try {
|
||||
_itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount());
|
||||
} catch (ResourceUnavailableException e) {
|
||||
// FIXME
|
||||
assert false : "How do we hit this when force is true?";
|
||||
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
|
||||
} catch (OperationTimedoutException e) {
|
||||
// FIXME
|
||||
assert false : "How do we hit this when force is true?";
|
||||
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
|
||||
} catch (ConcurrentOperationException e) {
|
||||
// FIXME
|
||||
}
|
||||
assert false : "How do we hit this when force is true?";
|
||||
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
|
||||
}
|
||||
}
|
||||
|
||||
List<HaWorkVO> items = _haDao.findPreviousHA(vm.getId());
|
||||
|
|
@ -454,7 +463,19 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu
|
|||
work.setStep(Step.Scheduled);
|
||||
_haDao.update(work.getId(), work);
|
||||
} else {
|
||||
assert false : "How come that HA step is Investigating and the host is removed?";
|
||||
s_logger.debug("How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways");
|
||||
try {
|
||||
_itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount());
|
||||
} catch (ResourceUnavailableException e) {
|
||||
assert false : "How do we hit this when force is true?";
|
||||
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
|
||||
} catch (OperationTimedoutException e) {
|
||||
assert false : "How do we hit this when force is true?";
|
||||
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
|
||||
} catch (ConcurrentOperationException e) {
|
||||
assert false : "How do we hit this when force is true?";
|
||||
throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -594,12 +615,6 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu
|
|||
s_logger.info("Stopping " + vm);
|
||||
try {
|
||||
if (work.getWorkType() == WorkType.Stop) {
|
||||
if (vm.getHostId() == null) {
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug(vm.toString() + " has already been stopped");
|
||||
}
|
||||
return null;
|
||||
}
|
||||
if (_itMgr.advanceStop(vm, false, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount())) {
|
||||
s_logger.info("Successfully stopped " + vm);
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -132,6 +132,10 @@ public class ItWorkVO {
|
|||
return managementServerId;
|
||||
}
|
||||
|
||||
public void setManagementServerId(long managementServerId) {
|
||||
this.managementServerId = managementServerId;
|
||||
}
|
||||
|
||||
public State getType() {
|
||||
return type;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -924,14 +924,34 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//grab outstanding work item if any
|
||||
ItWorkVO work = _workDao.findByOutstandingWork(vm.getId(), vm.getState());
|
||||
if(work != null){
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Found an outstanding work item for this vm "+ vm + " with state:"+vm.getState()+", work id:"+work.getId());
|
||||
}
|
||||
}
|
||||
Long hostId = vm.getHostId();
|
||||
if (hostId == null) {
|
||||
if(!forced){
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("HostId is null but this is not a forced stop, cannot stop vm "+ vm + " with state:"+vm.getState());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
try {
|
||||
stateTransitTo(vm, Event.AgentReportStopped, null, null);
|
||||
} catch (NoTransitionException e) {
|
||||
s_logger.warn(e.getMessage());
|
||||
}
|
||||
//mark outstanding work item if any as done
|
||||
if (work != null) {
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Updating work item to Done, id:"+work.getId());
|
||||
}
|
||||
work.setStep(Step.Done);
|
||||
_workDao.update(work.getId(), work);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -946,22 +966,46 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
|
|||
if (!forced) {
|
||||
throw new CloudRuntimeException("We cannot stop " + vm + " when it is in state " + vm.getState());
|
||||
}
|
||||
s_logger.debug("Unable to transition the state but we're moving on because it's forced stop");
|
||||
if (state == State.Starting || state == State.Stopping || state == State.Migrating) {
|
||||
ItWorkVO work = _workDao.findByOutstandingWork(vm.getId(), vm.getState());
|
||||
boolean doCleanup = false;
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Unable to transition the state but we're moving on because it's forced stop");
|
||||
}
|
||||
if (state == State.Starting || state == State.Migrating) {
|
||||
if (work != null) {
|
||||
if (cleanup(vmGuru, new VirtualMachineProfileImpl<T>(vm), work, Event.StopRequested, forced, user, account)) {
|
||||
try {
|
||||
return stateTransitTo(vm, Event.AgentReportStopped, null);
|
||||
} catch (NoTransitionException e) {
|
||||
s_logger.warn("Unable to cleanup " + vm);
|
||||
return false;
|
||||
}
|
||||
doCleanup = true;
|
||||
}else{
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Unable to cleanup VM: "+ vm+" ,since outstanding work item is not found");
|
||||
}
|
||||
throw new CloudRuntimeException("Work item not found, We cannot stop " + vm + " when it is in state " + vm.getState());
|
||||
}
|
||||
}else if(state == State.Stopping){
|
||||
doCleanup = true;
|
||||
}
|
||||
|
||||
if(doCleanup){
|
||||
if (cleanup(vmGuru, new VirtualMachineProfileImpl<T>(vm), work, Event.StopRequested, forced, user, account)) {
|
||||
try {
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Updating work item to Done, id:"+work.getId());
|
||||
}
|
||||
return changeState(vm, Event.AgentReportStopped, null, work, Step.Done);
|
||||
} catch (NoTransitionException e) {
|
||||
s_logger.warn("Unable to cleanup " + vm);
|
||||
return false;
|
||||
}
|
||||
}else{
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Failed to cleanup VM: "+ vm);
|
||||
}
|
||||
throw new CloudRuntimeException("Failed to cleanup " + vm + " , current state " + vm.getState());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(vm.getState() != State.Stopping){
|
||||
throw new CloudRuntimeException("We cannot proceed with stop VM " + vm + " since it is not in 'Stopping' state, current state: " + vm.getState());
|
||||
}
|
||||
String routerPrivateIp = null;
|
||||
if (vm.getType() == VirtualMachine.Type.DomainRouter) {
|
||||
routerPrivateIp = vm.getPrivateIpAddress();
|
||||
|
|
@ -1017,6 +1061,14 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
|
|||
}
|
||||
|
||||
try {
|
||||
if (work != null) {
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Updating the outstanding work item to Done, id:"+work.getId());
|
||||
}
|
||||
work.setStep(Step.Done);
|
||||
_workDao.update(work.getId(), work);
|
||||
}
|
||||
|
||||
return stateTransitTo(vm, Event.OperationSucceeded, null, null);
|
||||
} catch (NoTransitionException e) {
|
||||
s_logger.warn(e.getMessage());
|
||||
|
|
@ -1262,14 +1314,18 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
|
|||
if (vm != null) {
|
||||
if (work.getType() == State.Starting) {
|
||||
_haMgr.scheduleRestart(vm, true);
|
||||
work.setManagementServerId(_nodeId);
|
||||
_workDao.update(work.getId(), work);
|
||||
} else if (work.getType() == State.Stopping) {
|
||||
_haMgr.scheduleStop(vm, vm.getHostId(), WorkType.CheckStop);
|
||||
work.setManagementServerId(_nodeId);
|
||||
_workDao.update(work.getId(), work);
|
||||
} else if (work.getType() == State.Migrating) {
|
||||
_haMgr.scheduleMigration(vm);
|
||||
work.setStep(Step.Done);
|
||||
_workDao.update(work.getId(), work);
|
||||
}
|
||||
}
|
||||
work.setStep(Step.Done);
|
||||
_workDao.update(work.getId(), work);
|
||||
} catch (Exception e) {
|
||||
s_logger.error("Error while handling " + work, e);
|
||||
}
|
||||
|
|
@ -1693,6 +1749,15 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
|
|||
|
||||
s_logger.debug("VM state is starting on full sync so updating it to running");
|
||||
vm = findById(vm.getType(), vm.getId());
|
||||
|
||||
//grab outstanding work item if any
|
||||
ItWorkVO work = _workDao.findByOutstandingWork(vm.getId(), vm.getState());
|
||||
if(work != null){
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Found an outstanding work item for this vm "+ vm + " in state:"+vm.getState()+", work id:"+work.getId());
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
stateTransitTo(vm, cause, hostId);
|
||||
} catch (NoTransitionException e1) {
|
||||
|
|
@ -1726,6 +1791,14 @@ public class VirtualMachineManagerImpl implements VirtualMachineManager, Listene
|
|||
} else {
|
||||
s_logger.error("Unable to finalize commands on start for vm: " + vm);
|
||||
}
|
||||
|
||||
if (work != null) {
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Updating outstanding work item to Done, id:"+work.getId());
|
||||
}
|
||||
work.setStep(Step.Done);
|
||||
_workDao.update(work.getId(), work);
|
||||
}
|
||||
}
|
||||
|
||||
public Commands fullSync(final long hostId, StartupRoutingCommand startup) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue