CLOUDSTACK-9112: deployVM thread holds the global network lock too long, causing delays; also some improvements in the planner

Some VM deployment failures occur when multiple VMs are deployed at a time, mainly due to NetworkModel code that iterates over all the VLANs in the pod. This iteration causes each deployVM thread to hold the global lock on the network longer, introducing delays. The delay in turn causes more threads to choose the same host and fail, since capacity is no longer available on that host.

The following changes reduce the delays during VM deployment that in turn cause some VM deployment failures when multiple VMs are launched at a time:

    In the planner, remove clusters that do not contain a host with a matching service offering tag. This saves iterating over clusters that don't have a matching tagged host.
    In NetworkModel, do not query the VLANs for the pod inside the loop, and optimize the logic that looks up the IP/IPv6 address (see the sketch after this list).
    In DeploymentPlanningManagerImpl, do not process affinity groups if the plan already provides a hostId.
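A minimal, hypothetical sketch of the second item. The names VlanDao, IpDao, findVlanOfIp, and LoopHoistingSketch are simplified stand-ins invented for illustration, not the real CloudStack classes shown in the diffs below; only the query-hoisting pattern mirrors the fix.

// Hypothetical sketch: VlanDao/IpDao/findVlanOfIp are simplified stand-ins,
// not the CloudStack DAOs; only the query-hoisting pattern mirrors the fix.
import java.util.List;

public class LoopHoistingSketch {

    interface VlanDao { List<Long> listVlansForPod(long podId); }
    interface IpDao { Long findVlanOfIp(long networkId, String ip); }

    // Before the fix: the pod's VLANs were fetched inside the per-NIC loop and the
    // address lookup ran inside the per-VLAN loop, i.e. O(nics) VLAN queries and up
    // to O(nics * vlans) address queries, all while the global network lock was held.
    // After the fix (sketched here): one VLAN query, and one address lookup per NIC.
    static String findPlaceholderNic(List<String> nicIps, long podId, long networkId,
                                     VlanDao vlanDao, IpDao ipDao) {
        List<Long> podVlans = vlanDao.listVlansForPod(podId); // hoisted out of the loop
        for (String nicIp : nicIps) {
            Long vlanOfIp = ipDao.findVlanOfIp(networkId, nicIp); // once per NIC
            if (vlanOfIp != null && podVlans.contains(vlanOfIp)) {
                return nicIp; // this NIC's address belongs to the pod's IP range
            }
        }
        return null;
    }
}

The same hoisting shows up in the NetworkModelImpl diff below, where listVlansForPod moves out of the NIC loop and the ip/ipv6 lookup moves out of the VLAN loop.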
Harikrishna Patnala 2017-03-01 14:30:51 +05:30
parent 1decf5366d
commit f34469a41b
6 changed files with 130 additions and 48 deletions


@@ -98,5 +98,7 @@ public interface HostDao extends GenericDao<HostVO, Long>, StateDao<Status, Stat
HostVO findByPublicIp(String publicIp);
List<Long> listClustersByHostTag(String hostTagOnOffering);
List<HostVO> listByType(Type type);
}


@@ -77,6 +77,8 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
private static final Logger status_logger = Logger.getLogger(Status.class);
private static final Logger state_logger = Logger.getLogger(ResourceState.class);
private static final String LIST_CLUSTERID_FOR_HOST_TAG = "select distinct cluster_id from host join host_tags on host.id = host_tags.host_id and host_tags.tag = ?";
protected SearchBuilder<HostVO> TypePodDcStatusSearch;
protected SearchBuilder<HostVO> IdStatusSearch;
@@ -1129,6 +1131,29 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
return customSearch(sc, null);
}
@Override
public List<Long> listClustersByHostTag(String hostTagOnOffering) {
TransactionLegacy txn = TransactionLegacy.currentTxn();
PreparedStatement pstmt = null;
List<Long> result = new ArrayList<Long>();
StringBuilder sql = new StringBuilder(LIST_CLUSTERID_FOR_HOST_TAG);
// list the distinct ids of all clusters that contain at least one host tagged with the given offering host tag
try {
pstmt = txn.prepareAutoCloseStatement(sql.toString());
pstmt.setString(1, hostTagOnOffering);
ResultSet rs = pstmt.executeQuery();
while (rs.next()) {
result.add(rs.getLong(1));
}
return result;
} catch (SQLException e) {
throw new CloudRuntimeException("DB Exception on: " + sql, e);
} catch (Throwable e) {
throw new CloudRuntimeException("Caught: " + sql, e);
}
}
@Override
public List<HostVO> listAllHostsByType(Host.Type type) {
SearchCriteria<HostVO> sc = TypeSearch.create();


@@ -249,50 +249,16 @@ StateListener<State, VirtualMachine.Event, VirtualMachine> {
public DeployDestination planDeployment(VirtualMachineProfile vmProfile, DeploymentPlan plan, ExcludeList avoids, DeploymentPlanner planner)
throws InsufficientServerCapacityException, AffinityConflictException {
// call affinitygroup chain
ServiceOffering offering = vmProfile.getServiceOffering();
int cpu_requested = offering.getCpu() * offering.getSpeed();
long ram_requested = offering.getRamSize() * 1024L * 1024L;
VirtualMachine vm = vmProfile.getVirtualMachine();
long vmGroupCount = _affinityGroupVMMapDao.countAffinityGroupsForVm(vm.getId());
DataCenter dc = _dcDao.findById(vm.getDataCenterId());
if (vmGroupCount > 0) {
for (AffinityGroupProcessor processor : _affinityProcessors) {
processor.process(vmProfile, plan, avoids);
}
}
if (vm.getType() == VirtualMachine.Type.User || vm.getType() == VirtualMachine.Type.DomainRouter) {
checkForNonDedicatedResources(vmProfile, dc, avoids);
}
if (s_logger.isDebugEnabled()) {
s_logger.debug("Deploy avoids pods: " + avoids.getPodsToAvoid() + ", clusters: " + avoids.getClustersToAvoid() + ", hosts: " + avoids.getHostsToAvoid());
}
// call planners
//DataCenter dc = _dcDao.findById(vm.getDataCenterId());
// check if datacenter is in avoid set
if (avoids.shouldAvoid(dc)) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("DataCenter id = '" + dc.getId() + "' provided is in avoid set, DeploymentPlanner cannot allocate the VM, returning.");
}
return null;
}
ServiceOffering offering = vmProfile.getServiceOffering();
if(planner == null){
String plannerName = offering.getDeploymentPlanner();
if (plannerName == null) {
if (vm.getHypervisorType() == HypervisorType.BareMetal) {
plannerName = "BareMetalPlanner";
} else {
plannerName = _configDao.getValue(Config.VmDeploymentPlanner.key());
}
}
planner = getDeploymentPlannerByName(plannerName);
}
int cpu_requested = offering.getCpu() * offering.getSpeed();
long ram_requested = offering.getRamSize() * 1024L * 1024L;
if (s_logger.isDebugEnabled()) {
s_logger.debug("DeploymentPlanner allocation algorithm: " + planner);
@@ -364,6 +330,44 @@ StateListener<State, VirtualMachine.Event, VirtualMachine> {
return null;
}
// call affinitygroup chain
long vmGroupCount = _affinityGroupVMMapDao.countAffinityGroupsForVm(vm.getId());
if (vmGroupCount > 0) {
for (AffinityGroupProcessor processor : _affinityProcessors) {
processor.process(vmProfile, plan, avoids);
}
}
if (vm.getType() == VirtualMachine.Type.User) {
checkForNonDedicatedResources(vmProfile, dc, avoids);
}
if (s_logger.isDebugEnabled()) {
s_logger.debug("Deploy avoids pods: " + avoids.getPodsToAvoid() + ", clusters: " + avoids.getClustersToAvoid() + ", hosts: " + avoids.getHostsToAvoid());
}
// call planners
// DataCenter dc = _dcDao.findById(vm.getDataCenterId());
// check if datacenter is in avoid set
if (avoids.shouldAvoid(dc)) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("DataCenter id = '" + dc.getId() + "' provided is in avoid set, DeploymentPlanner cannot allocate the VM, returning.");
}
return null;
}
if (planner == null) {
String plannerName = offering.getDeploymentPlanner();
if (plannerName == null) {
if (vm.getHypervisorType() == HypervisorType.BareMetal) {
plannerName = "BareMetalPlanner";
} else {
plannerName = _configDao.getValue(Config.VmDeploymentPlanner.key());
}
}
planner = getDeploymentPlannerByName(plannerName);
}
if (vm.getLastHostId() != null && haVmTag == null) {
s_logger.debug("This VM has last host_id specified, trying to choose the same host: " + vm.getLastHostId());


@@ -393,6 +393,10 @@ public class FirstFitPlanner extends AdapterBase implements DeploymentClusterPla
}
removeClustersCrossingThreshold(prioritizedClusterIds, avoid, vmProfile, plan);
String hostTagOnOffering = offering.getHostTag();
if (hostTagOnOffering != null) {
removeClustersWithoutMatchingTag(prioritizedClusterIds, hostTagOnOffering);
}
} else {
if (s_logger.isDebugEnabled()) {
@@ -520,6 +524,18 @@ public class FirstFitPlanner extends AdapterBase implements DeploymentClusterPla
}
private void removeClustersWithoutMatchingTag(List<Long> clusterListForVmAllocation, String hostTagOnOffering) {
List<Long> matchingClusters = hostDao.listClustersByHostTag(hostTagOnOffering);
clusterListForVmAllocation.retainAll(matchingClusters);
if (s_logger.isDebugEnabled()) {
s_logger.debug("The clusterId list for the given offering tag: " + clusterListForVmAllocation);
}
}
private boolean isRootAdmin(VirtualMachineProfile vmProfile) {
if (vmProfile != null) {
if (vmProfile.getOwner() != null) {


@@ -2217,24 +2217,29 @@ public class NetworkModelImpl extends ManagerBase implements NetworkModel {
@Override
public NicVO getPlaceholderNicForRouter(Network network, Long podId) {
List<NicVO> nics = _nicDao.listPlaceholderNicsByNetworkIdAndVmType(network.getId(), VirtualMachine.Type.DomainRouter);
List<? extends Vlan> vlans = new ArrayList<VlanVO>();
if (podId != null) {
vlans = _vlanDao.listVlansForPod(podId);
}
for (NicVO nic : nics) {
if (nic.getReserver() == null && (nic.getIPv4Address() != null || nic.getIPv6Address() != null)) {
if (podId == null) {
return nic;
} else {
IpAddress ip = null;
UserIpv6AddressVO ipv6 = null;
if (nic.getIPv4Address() != null) {
ip = _ipAddressDao.findByIpAndSourceNetworkId(network.getId(), nic.getIPv4Address());
} else {
ipv6 = _ipv6Dao.findByNetworkIdAndIp(network.getId(), nic.getIPv6Address());
}
//return nic only when its ip address belongs to the pod range (for the Basic zone case)
List<? extends Vlan> vlans = _vlanDao.listVlansForPod(podId);
for (Vlan vlan : vlans) {
if (nic.getIPv4Address() != null) {
IpAddress ip = _ipAddressDao.findByIpAndSourceNetworkId(network.getId(), nic.getIPv4Address());
if (ip != null && ip.getVlanId() == vlan.getId()) {
return nic;
}
} else {
UserIpv6AddressVO ipv6 = _ipv6Dao.findByNetworkIdAndIp(network.getId(), nic.getIPv6Address());
if (ipv6 != null && ipv6.getVlanId() == vlan.getId()) {
return nic;
}
if (ip != null && ip.getVlanId() == vlan.getId()) {
return nic;
} else if (ipv6 != null && ipv6.getVlanId() == vlan.getId()) {
return nic;
}
}
}


@@ -30,6 +30,7 @@ import java.util.Map;
import javax.inject.Inject;
import com.cloud.offering.ServiceOffering;
import org.apache.cloudstack.context.CallContext;
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreManager;
import org.apache.cloudstack.framework.config.ConfigDepot;
@@ -127,6 +128,8 @@ public class FirstFitPlannerTest {
ConfigDepotImpl configDepot;
@Inject
ScopedConfigStorage scopedStorage;
@Inject
HostDao hostDao;
private static long domainId = 1L;
long dataCenterId = 1L;
@@ -192,6 +195,33 @@ public class FirstFitPlannerTest {
assertTrue("Reordered cluster list have clusters exceeding threshold", (!clusterList.containsAll(clustersCrossingThreshold)));
}
@Test
public void checkClusterListBasedOnHostTag() throws InsufficientServerCapacityException {
VirtualMachineProfileImpl vmProfile = mock(VirtualMachineProfileImpl.class);
DataCenterDeployment plan = mock(DataCenterDeployment.class);
ExcludeList avoids = mock(ExcludeList.class);
initializeForTest(vmProfile, plan, avoids);
List<Long> matchingClusters = initializeForClusterListBasedOnHostTag(vmProfile.getServiceOffering());
List<Long> clusterList = planner.orderClusters(vmProfile, plan, avoids);
assertTrue("Reordered cluster list have clusters which has hosts with specified host tag on offering", (clusterList.containsAll(matchingClusters)));
assertTrue("Reordered cluster list does not have clusters which dont have hosts with matching host tag on offering", (!clusterList.contains(2L)));
}
private List<Long> initializeForClusterListBasedOnHostTag(ServiceOffering offering) {
when(offering.getHostTag()).thenReturn("hosttag1");
initializeForClusterThresholdDisabled();
List<Long> matchingClusters = new ArrayList<>();
matchingClusters.add(3L);
matchingClusters.add(5L);
when(hostDao.listClustersByHostTag("hosttag1")).thenReturn(matchingClusters);
return matchingClusters;
}
@Test
public void checkClusterReorderingForStartVMWithThresholdCheckDisabled() throws InsufficientServerCapacityException {
VirtualMachineProfileImpl vmProfile = mock(VirtualMachineProfileImpl.class);