Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
7ebc81d
HA Agent check regardless of NFS
GabrielBrascher Jan 22, 2021
44a320a
Add KVM HA Client validation on HA execution flow
GabrielBrascher Jan 25, 2021
895421e
kvm: Add Agent Helper to investigate if a Host is truly down
wido Jan 22, 2021
f97100d
Integrating Agent HA Helper
GabrielBrascher Jan 27, 2021
2e8b563
Count number of VMs running on Define if a host is healthy by countin…
GabrielBrascher Jan 28, 2021
4bafec0
Add global settings configurations; enhance documentation; work on tests
GabrielBrascher Feb 11, 2021
fa998e1
Remove commented lines
GabrielBrascher Apr 29, 2021
ee4c28c
Consider Paused instances to count when a VM is being migrated and al…
GabrielBrascher Apr 30, 2021
6953b95
Final adjustments before calling for review
GabrielBrascher May 3, 2021
35449e8
Refactoring tests and adding small improvements
GabrielBrascher May 5, 2021
0506649
Fix missing parameter on log String.format
GabrielBrascher May 6, 2021
4f7936e
Update code addressing reviewers
GabrielBrascher May 11, 2021
6d59a81
Simplify code flow, enhance Python, address reviewers.
GabrielBrascher May 11, 2021
c37b39a
Create KvmHaHelper class. Enhance HA validations by checking if host …
GabrielBrascher May 19, 2021
b5191e8
Fix checkstyle
GabrielBrascher May 19, 2021
e4c11fa
Add tests and enhance HA flow via new KvmHaHelper
GabrielBrascher May 19, 2021
de8f68e
Fix isKvmHaWebserviceEnabled
GabrielBrascher May 19, 2021
7b4e086
Add test cases and config key
GabrielBrascher May 22, 2021
891a095
Change KvmHaAcceptedProblematicHostsRatio config key description
GabrielBrascher May 26, 2021
e2f20e3
Address reviewer enhancing log and 'if' conditional
GabrielBrascher Jun 2, 2021
10cd13b
Enhance Documentation & remove unnecessary Catch (them all)
GabrielBrascher Jun 21, 2021
839d13c
Fine adjustments
GabrielBrascher Jun 23, 2021
a9e9a8e
Re-check packaging
GabrielBrascher Jun 24, 2021
25d8a0b
Changes on centos7 packaging for agent-ha-helper
GabrielBrascher Jul 2, 2021
4ec91b5
Remove unnecessary line after rebasing and fixing conflicts with main
GabrielBrascher Jul 7, 2021
25ffd18
Remove nested IF in KvmHaHelper
GabrielBrascher Jul 7, 2021
9b519e2
Address review removing nested IF and enhancing KvmHaAgentClient.prep…
GabrielBrascher Jul 7, 2021
178a09c
change header
GabrielBrascher Jul 8, 2021
ddd8aba
Fix cloud.spec with permissions for cloudstack-agent-ha-helper
GabrielBrascher Jul 16, 2021
d760b55
Use Stream instead of For loop.
GabrielBrascher Jul 16, 2021
19b1884
Use Cluster scope for KvmHaAcceptedProblematicHostsRatio
GabrielBrascher Aug 11, 2021
a5d139c
Remove unused variable
GabrielBrascher Aug 12, 2021
955c65e
Enhance KVM HA checks via neighbour hosts.
GabrielBrascher Aug 16, 2021
89ff240
Enhance isKvmHaWebserviceEnabled
GabrielBrascher Aug 16, 2021
0e94c1d
Avoid issues in case of big cluster, and thus limit number of rertrie…
GabrielBrascher Aug 16, 2021
8b5faa7
Port python HTTP client to HTTPS
GabrielBrascher Nov 19, 2021
95c58d9
Rebase against main branch, fix test erros after updating default por…
GabrielBrascher Jan 4, 2022
a1271d2
Add SSL & Authentication support into the agent-ha-helper.py
GabrielBrascher Jan 18, 2022
5918bef
Update KVM HA Client to mach with KVM HA Helper HTTPS server.
GabrielBrascher Jan 21, 2022
10c8636
Update "check-neighbour" and client/server HTTPs with Auth
GabrielBrascher Mar 9, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add global settings configurations; enhance documentation; work on tests
Change VMs counting
Fix test case issues and enhance documentations and code
  • Loading branch information
GabrielBrascher committed Mar 4, 2022
commit 4bafec0e02324acc725366a2797854b2f60762f3
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,12 @@
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.hypervisor.Hypervisor;
import com.cloud.hypervisor.kvm.resource.KvmAgentHaClient;
import com.cloud.resource.ResourceManager;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.component.AdapterBase;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.dao.VMInstanceDao;
import org.apache.cloudstack.ha.HAManager;
import org.apache.cloudstack.kvm.ha.KvmHaAgentClient;
import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao;
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
import org.apache.log4j.Logger;
Expand Down Expand Up @@ -86,21 +85,29 @@ public Status isAgentAlive(Host agent) {
boolean hasNfs = isHostServedByNfsPool(agent);
if (hasNfs) {
agentStatus = checkAgentStatusViaNfs(agent);
s_logger.debug(String.format("Agent investigation was requested on host %s, agent status via NFS storage is %s.", agent, agentStatus));
s_logger.debug(String.format("Agent investigation was requested on host %s. Agent status via NFS heartbeat is %s.", agent, agentStatus));
} else {
s_logger.debug(String.format("Agent investigation was requested on host %s, but host has no NFS storage. Skipping investigation via NFS.", agent));
}

List<VMInstanceVO> vmsOnHost = vmInstanceDao.listByHostId(agent.getId());
KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(agent);
boolean isVmsOnKvmMatchingWithDatabase = kvmAgentHaClient.checkAgentHealthAndRunningVms(vmsOnHost.size());
if(isVmsOnKvmMatchingWithDatabase) {
agentStatus = checkAgentStatusViaKvmHaAgent(agent, agentStatus);

return agentStatus;
}

/**
 * Checks the health of the KVM node via the KVM HA Agent.
 * If the agent reports the node as healthy this returns {@code Status.Up};
 * otherwise the provided status is returned unchanged.
 *
 * @param agent       the host under investigation
 * @param agentStatus the status determined before this check (e.g. by the NFS heartbeat investigation)
 * @return {@code Status.Up} when the KVM HA Agent check succeeds, otherwise {@code agentStatus} as received
 */
private Status checkAgentStatusViaKvmHaAgent(Host agent, Status agentStatus) {
    KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(agent);

    boolean isVmsCountOnKvmMatchingWithDatabase = kvmHaAgentClient.isKvmHaAgentHealthy(agent, vmInstanceDao);
    if (isVmsCountOnKvmMatchingWithDatabase) {
        agentStatus = Status.Up;
        // The format string has one %s placeholder; the host must be supplied or String.format throws.
        s_logger.debug(String.format("Checking agent %s status; KVM HA Agent is Running as expected.", agent));
    } else {
        s_logger.warn(String.format("Checking agent %s status. Failed to check host status via KVM HA Agent", agent));
    }

    return agentStatus;
}

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,11 @@ public class KVMHAConfig {
public static final ConfigKey<Long> KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60",
"The maximum length of time, in seconds, expected for a fence operation to complete.", true, ConfigKey.Scope.Cluster);

/**
 * Cluster-scoped setting 'kvm.ha.webservice.port': the port used to communicate with the
 * KVM HA Agent running on KVM nodes. Defaults to 8080.
 */
public static final ConfigKey<Integer> KVM_HA_WEBSERVICE_PORT = new ConfigKey<>("Advanced", Integer.class, "kvm.ha.webservice.port", "8080",
        "It sets the port used to communicate with the KVM HA Agent Microservice that is running on KVM nodes. Default value is 8080.",
        true, ConfigKey.Scope.Cluster);

/**
 * Cluster-scoped setting 'kvm.ha.webservice.enabled': toggles the HA health-check performed
 * through the KVM HA webservice. Defaults to 'true'.
 */
public static final ConfigKey<Boolean> KVM_HA_WEBSERVICE_ENABLED = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.webservice.enabled", "true",
        "The KVM HA Webservice is executed on the KVM node and checks the amount of VMs running via libvirt. It serves as a HA health-check for KVM nodes. One can enable (set to 'true') or disable it ('false'). If disabled then CloudStack ignores HA validation via this agent.",
        true, ConfigKey.Scope.Cluster);
}
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,9 @@ public ConfigKey<?>[] getConfigKeys() {
KVMHAConfig.KvmHAActivityCheckFailureThreshold,
KVMHAConfig.KvmHADegradedMaxPeriod,
KVMHAConfig.KvmHARecoverWaitPeriod,
KVMHAConfig.KvmHARecoverAttemptThreshold
KVMHAConfig.KvmHARecoverAttemptThreshold,
KVMHAConfig.KVM_HA_WEBSERVICE_PORT,
KVMHAConfig.KVM_HA_WEBSERVICE_ENABLED
};
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.agent.api.CheckVMActivityOnStoragePoolCommand;
import com.cloud.dc.ClusterVO;
import com.cloud.dc.dao.ClusterDao;
import com.cloud.exception.StorageUnavailableException;
import com.cloud.host.Host;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.hypervisor.Hypervisor;
import com.cloud.hypervisor.kvm.resource.KvmAgentHaClient;
import com.cloud.resource.ResourceManager;
import com.cloud.storage.Storage;
import com.cloud.storage.StorageManager;
Expand All @@ -51,6 +52,7 @@
import java.util.HashMap;
import java.util.List;


public class KVMHostActivityChecker extends AdapterBase implements ActivityCheckerInterface<Host>, HealthCheckerInterface<Host> {
private final static Logger LOG = Logger.getLogger(KVMHostActivityChecker.class);

Expand All @@ -66,6 +68,8 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck
private StorageManager storageManager;
@Inject
private ResourceManager resourceManager;
@Inject
private ClusterDao clusterDao;

@Override
public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException {
Expand All @@ -87,21 +91,41 @@ public boolean isHealthy(Host r) {
HashMap<StoragePool, List<Volume>> poolVolMap = getVolumeUuidOnHost(r);
isHealthy = isHealthyCheckViaNfs(r, isHealthy, poolVolMap);

KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(r);
List<VMInstanceVO> vmsOnHost = vmInstanceDao.listByHostId(r.getId());
boolean checkKvmHeatlh = kvmAgentHaClient.checkAgentHealthAndRunningVms(vmsOnHost.size());
isHealthy = checkHealthViaKvmHaWebservice(r, isHealthy);

if(!isHealthy && checkKvmHeatlh) {
isHealthy = true;
return isHealthy;
}

/**
 * Checks the host health via a web-service that retrieves the running KVM instances via libvirt. <br>
 * The health-check is executed on the KVM node and verifies the number of VMs running and whether the libvirt service is running. <br><br>
 *
 * One can enable or disable it via the global setting 'kvm.ha.webservice.enabled'.
 *
 * @param r         the host to be checked
 * @param isHealthy the health result obtained before this check; returned untouched when the web-service is disabled
 * @return {@code true} if the host was already considered healthy or the KVM HA Agent reports it as healthy
 */
private boolean checkHealthViaKvmHaWebservice(Host r, boolean isHealthy) {
    KvmHaAgentClient kvmHaAgentClient = new KvmHaAgentClient(r);
    if (!kvmHaAgentClient.isKvmHaWebserviceEnabled()) {
        // NOTE(review): clusterDao.findById may return null for a missing cluster, which would make this log line throw - confirm.
        ClusterVO cluster = clusterDao.findById(r.getClusterId());
        LOG.debug(String.format("Skipping KVM HA web-service verification for %s due to 'kvm.ha.webservice.enabled' not enabled for cluster [id: %d, name: %s].",
                r.toString(), cluster.getId(), cluster.getName()));
        return isHealthy;
    }

    // The agent is queried unconditionally (as before), then combined with the previous result:
    // a healthy agent can upgrade an unhealthy verdict, but never downgrade a healthy one.
    boolean isKvmHaAgentHealthy = kvmHaAgentClient.isKvmHaAgentHealthy(r, vmInstanceDao);
    return isHealthy || isKvmHaAgentHealthy;
}

private boolean isHealthyCheckViaNfs(Host r, boolean isHealthy, HashMap<StoragePool, List<Volume>> poolVolMap) {
for (StoragePool pool : poolVolMap.keySet()) {
if(Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType()
|| Storage.StoragePoolType.ManagedNFS == pool.getPoolType()
|| Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) {
isHealthy = isAgentActive(r);
}
Expand Down Expand Up @@ -176,35 +200,29 @@ private boolean isVMActivtyOnHost(Host agent, DateTime suspectTime) throws HAChe
if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType()));
}
boolean activityStatus = false;
boolean activityStatus = true;
HashMap<StoragePool, List<Volume>> poolVolMap = getVolumeUuidOnHost(agent);
for (StoragePool pool : poolVolMap.keySet()) {
if(Storage.StoragePoolType.NetworkFilesystem == pool.getPoolType()
|| Storage.StoragePoolType.ManagedNFS == pool.getPoolType()
|| Storage.StoragePoolType.ManagedNFS == pool.getPoolType()) {
activityStatus = verifyActivityOfStorageOnHost(poolVolMap, pool, agent, suspectTime, activityStatus);
activityStatus = checkVmActivityOnStoragePool(poolVolMap, pool, agent, suspectTime, activityStatus);
if (!activityStatus) {
LOG.warn(String.format("It seems that the storage pool [%s] does not have activity on %s.", pool.getId(), agent.toString()));
break;
}
}
}

KvmAgentHaClient kvmAgentHaClient = new KvmAgentHaClient(agent);
List<VMInstanceVO> vmsOnHost = vmInstanceDao.listByHostId(agent.getId());
boolean isVmsOnKvmMatchingWithDatabase = kvmAgentHaClient.checkAgentHealthAndRunningVms(vmsOnHost.size());
activityStatus = checkHealthViaKvmHaWebservice(agent, activityStatus);

if(!activityStatus && isVmsOnKvmMatchingWithDatabase) {
activityStatus = true;
} else {
if(!activityStatus){
LOG.warn(String.format("No VM activity detected on %s. This might trigger HA Host Recovery and/or Fence.", agent.toString()));
}

return activityStatus;
}


protected boolean verifyActivityOfStorageOnHost(HashMap<StoragePool, List<Volume>> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException {
private boolean checkVmActivityOnStoragePool(HashMap<StoragePool, List<Volume>> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException {
List<Volume> volume_list = poolVolMap.get(pool);
final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(agent, pool, volume_list, suspectTime);

Expand Down
Loading