Fix host stuck in connecting state (#375)

* Fix host stuck in connecting state (#8502)

There are a lot of test failures due to test_vm_life_cycle.py in multiple PRs due to host not available for migration of VMs.
#8438 (comment)
#8433 (comment)
#7344 (comment)

While debugging I noticed that the hosts get stuck in Connecting state because MS is waiting for a response of the ReadyCommand from the agent. Since we take a lock on connection and disconnection, restarting the agent doesn't work. To fix this, we have to restart the MS or wait for ~1 hour (default timeout).

On the agent side, it gets stuck waiting for a response from the Script execution.

To reproduce, run smoke/test_vm_life_cycle.py (TestSecuredVmMigration test class to be specific). Once the tests are complete, you will notice that some hosts are stuck in Connecting state. And restarting the agent fails due to the named lock. Locks on DB can be checked using the below query.

SELECT *
FROM performance_schema.metadata_locks
INNER JOIN performance_schema.threads ON THREAD_ID = OWNER_THREAD_ID
WHERE PROCESSLIST_ID <> CONNECTION_ID() \G;

This PR adds a wait for the ready command and a timeout to the Script execution to ensure that the thread doesn't get stuck and the named lock from database is released.

* Externalise a few timeouts & fix timeout for hostSupportsUefi in libvirt ready command wrapper (#8547)

This PR fixes bug introduced in #8502. Timeout for script execution was set to 60 ms instead of 60s which resulted in host not getting UEFI enabled. This is a blocker for 4.19 release.

We do this by introducing a new agent parameter `agent.script.timeout` (default - 60 seconds) to use as a timeout for the script checking host's UEFI status.

We also externalize the timeout for the ReadyCommand by introducing a new global setting `ready.command.wait` (default - 60 seconds).

For ModifyStoragePoolCommand, we don't externalize the timeout to avoid confusion for the user. Since, the required timeout can vary depending on the provider in use and we are only setting the wait for default host listener for now. Instead, we reuse the global `wait` setting by dividing it by `5` making the default value of 6 minutes (1800/5 = 360s) for ModifyStoragePoolCommand.

Note: the actual time, the MS waits is twice the wait set for a Command. Check reference code below.
19250403e6/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentAttache.java (L406-L442)

* fixup
This commit is contained in:
Vishesh 2024-02-21 13:44:53 +05:30 committed by GitHub
parent 89f93746ac
commit f30e07b312
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 44 additions and 6 deletions

View File

@ -5,9 +5,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@ -378,6 +378,10 @@ iscsi.session.cleanup.enabled=false
# If the time is exceeded shutdown will be forced.
#stop.script.timeout=120
# Time (in seconds) to wait for scripts to complete.
# This is currently used only while checking if the host supports UEFI.
#agent.script.timeout=60
# Definition of VMs video model type.
#vm.video.hardware=

View File

@ -657,6 +657,14 @@ public class AgentProperties{
*/
public static final Property<Integer> STOP_SCRIPT_TIMEOUT = new Property<>("stop.script.timeout", 120);
/**
* Time (in seconds) to wait for scripts to complete.<br>
* This is currently used only while checking if the host supports UEFI.<br>
* Data type: Integer.<br>
* Default value: <code>60</code>
*/
public static final Property<Integer> AGENT_SCRIPT_TIMEOUT = new Property<>("agent.script.timeout", 60);
/**
* Definition of VMs video model type.<br>
* Data type: String.<br>

View File

@ -47,6 +47,9 @@ public interface AgentManager {
"according to the hosts health check results",
true, ConfigKey.Scope.Cluster, null);
ConfigKey<Integer> ReadyCommandWait = new ConfigKey<Integer>("Advanced", Integer.class, "ready.command.wait",
"60", "Time in seconds to wait for Ready command to return", true);
public enum TapAgentsAction {
Add, Del, Contains,
}

View File

@ -596,6 +596,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
final Long dcId = host.getDataCenterId();
final ReadyCommand ready = new ReadyCommand(dcId, host.getId(), NumbersUtil.enableHumanReadableSizes);
ready.setWait(ReadyCommandWait.value());
final Answer answer = easySend(hostId, ready);
if (answer == null || !answer.getResult()) {
// this is tricky part for secondary storage
@ -1836,7 +1837,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
@Override
public ConfigKey<?>[] getConfigKeys() {
return new ConfigKey<?>[] { CheckTxnBeforeSending, Workers, Port, Wait, AlertWait, DirectAgentLoadSize,
DirectAgentPoolSize, DirectAgentThreadCap, EnableKVMAutoEnableDisable };
DirectAgentPoolSize, DirectAgentThreadCap, EnableKVMAutoEnableDisable, ReadyCommandWait };
}
protected class SetHostParamsListener implements Listener {

View File

@ -57,6 +57,12 @@ import java.util.List;
public class DefaultHostListener implements HypervisorHostListener {
private static final Logger s_logger = Logger.getLogger(DefaultHostListener.class);
/**
* Wait time for modify storage pool command to complete. We should wait for 5 minutes for the command to complete.
* This should ideally be externalised as a global configuration parameter in the future (See #8506).
**/
private final int modifyStoragePoolCommandWait = 300; // 5 minutes
@Inject
AgentManager agentMgr;
@Inject
@ -84,7 +90,6 @@ public class DefaultHostListener implements HypervisorHostListener {
@Inject
NetworkDao networkDao;
@Override
public boolean hostAdded(long hostId) {
return true;
@ -121,6 +126,9 @@ public class DefaultHostListener implements HypervisorHostListener {
public boolean hostConnect(long hostId, long poolId) throws StorageConflictException {
StoragePool pool = (StoragePool) this.dataStoreMgr.getDataStore(poolId, DataStoreRole.Primary);
ModifyStoragePoolCommand cmd = new ModifyStoragePoolCommand(true, pool);
cmd.setWait(modifyStoragePoolCommandWait);
s_logger.debug(String.format("Sending modify storage pool command to agent: %d for storage pool: %d with timeout %d seconds",
hostId, poolId, cmd.getWait()));
final Answer answer = agentMgr.easySend(hostId, cmd);
if (answer == null) {

View File

@ -25,6 +25,8 @@ import java.util.Map;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.ReadyAnswer;
import com.cloud.agent.api.ReadyCommand;
import com.cloud.agent.properties.AgentProperties;
import com.cloud.agent.properties.AgentPropertiesFileHandler;
import com.cloud.host.Host;
import com.cloud.hypervisor.kvm.resource.LibvirtComputingResource;
import com.cloud.resource.CommandWrapper;
@ -51,11 +53,12 @@ public final class LibvirtReadyCommandWrapper extends CommandWrapper<ReadyComman
private boolean hostSupportsUefi(boolean isUbuntuHost) {
String cmd = "rpm -qa | grep -i ovmf";
int timeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.AGENT_SCRIPT_TIMEOUT) * 1000; // Get property value & convert to milliseconds
if (isUbuntuHost) {
cmd = "dpkg -l ovmf";
}
s_logger.debug("Running command : " + cmd);
int result = Script.runSimpleBashScriptForExitValue(cmd);
s_logger.debug("Running command : [" + cmd + "] with timeout : " + timeout + " ms");
int result = Script.runSimpleBashScriptForExitValue(cmd, timeout);
s_logger.debug("Got result : " + result);
return result == 0;
}

View File

@ -511,6 +511,17 @@ public class Script implements Callable<String> {
return runSimpleBashScriptForExitValue(command, 0);
}
/**
* Executes a bash script and returns the exit value of the script.
*
* @param command
* The bash command to be executed.
* @param timeout
* The maximum time (in milliseconds) that the script is allowed to run before it is forcibly terminated.
*
* @return The exit value of the script. Returns -1 if the result is null or empty, or if it cannot be parsed into
* an integer which can happen in case of a timeout.
*/
public static int runSimpleBashScriptForExitValue(String command, int timeout) {
Script s = new Script("/bin/bash", timeout);