From adf876d69eb8a16836a892b098f3deb391251a12 Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Fri, 7 Jul 2023 08:54:42 -0600 Subject: [PATCH 1/2] Trigger out of band VM state update via libvirt event when VM stops --- .../src/main/java/com/cloud/agent/Agent.java | 22 ++++++- .../java/com/cloud/agent/api/PingCommand.java | 11 ++++ .../cloud/resource/AgentStatusUpdater.java | 5 ++ .../cloud/resource/ResourceStatusUpdater.java | 11 ++++ .../cloud/vm/VirtualMachineManagerImpl.java | 2 +- .../vm/VirtualMachinePowerStateSync.java | 2 +- .../vm/VirtualMachinePowerStateSyncImpl.java | 12 ++-- .../resource/LibvirtComputingResource.java | 66 ++++++++++++++++++- 8 files changed, 121 insertions(+), 10 deletions(-) create mode 100644 core/src/main/java/com/cloud/resource/AgentStatusUpdater.java create mode 100644 core/src/main/java/com/cloud/resource/ResourceStatusUpdater.java diff --git a/agent/src/main/java/com/cloud/agent/Agent.java b/agent/src/main/java/com/cloud/agent/Agent.java index e9213ca9b8c5..b1ec592b9fed 100644 --- a/agent/src/main/java/com/cloud/agent/Agent.java +++ b/agent/src/main/java/com/cloud/agent/Agent.java @@ -40,6 +40,8 @@ import javax.naming.ConfigurationException; +import com.cloud.resource.AgentStatusUpdater; +import com.cloud.resource.ResourceStatusUpdater; import com.cloud.utils.NumbersUtil; import org.apache.cloudstack.agent.lb.SetupMSListAnswer; import org.apache.cloudstack.agent.lb.SetupMSListCommand; @@ -100,7 +102,7 @@ * For more configuration options, see the individual types. * **/ -public class Agent implements HandlerFactory, IAgentControl { +public class Agent implements HandlerFactory, IAgentControl, AgentStatusUpdater { protected static Logger s_logger = Logger.getLogger(Agent.class); public enum ExitStatus { @@ -409,6 +411,20 @@ public void scheduleWatch(final Link link, final Request request, final long del } } + public void triggerUpdate() { + PingCommand command = _resource.getCurrentStatus(getId()); + command.setOutOfBand(true); + s_logger.debug("Sending out of band ping"); + + final Request request = new Request(_id, -1, command, false); + request.setSequence(getNextSequence()); + try { + _link.send(request.toBytes()); + } catch (final ClosedChannelException e) { + s_logger.warn("Unable to send ping update: " + request.toString()); + } + } + protected void cancelTasks() { synchronized (_watchList) { for (final WatchTask task : _watchList) { @@ -461,6 +477,10 @@ public void sendStartup(final Link link) { } catch (final ClosedChannelException e) { s_logger.warn("Unable to send request: " + request.toString()); } + + if (_resource instanceof ResourceStatusUpdater) { + ((ResourceStatusUpdater) _resource).registerStatusUpdater(this); + } } } diff --git a/core/src/main/java/com/cloud/agent/api/PingCommand.java b/core/src/main/java/com/cloud/agent/api/PingCommand.java index 1d62c5d13597..4192fc2e7474 100644 --- a/core/src/main/java/com/cloud/agent/api/PingCommand.java +++ b/core/src/main/java/com/cloud/agent/api/PingCommand.java @@ -24,6 +24,7 @@ public class PingCommand extends Command { Host.Type hostType; long hostId; + boolean outOfBand; protected PingCommand() { } @@ -33,6 +34,12 @@ public PingCommand(Host.Type type, long id) { hostId = id; } + public PingCommand(Host.Type type, long id, boolean oob) { + hostType = type; + hostId = id; + outOfBand = oob; + } + public Host.Type getHostType() { return hostType; } @@ -41,6 +48,10 @@ public long getHostId() { return hostId; } + public boolean getOutOfBand() { return outOfBand; } + + public void setOutOfBand(boolean oob) { this.outOfBand = oob; } + @Override public boolean executeInSequence() { return false; diff --git a/core/src/main/java/com/cloud/resource/AgentStatusUpdater.java b/core/src/main/java/com/cloud/resource/AgentStatusUpdater.java new file mode 100644 index 000000000000..d9fa3398fdab --- /dev/null +++ b/core/src/main/java/com/cloud/resource/AgentStatusUpdater.java @@ -0,0 +1,5 @@ +package com.cloud.resource; + +public interface AgentStatusUpdater { + void triggerUpdate(); +} diff --git a/core/src/main/java/com/cloud/resource/ResourceStatusUpdater.java b/core/src/main/java/com/cloud/resource/ResourceStatusUpdater.java new file mode 100644 index 000000000000..402ef7135c9a --- /dev/null +++ b/core/src/main/java/com/cloud/resource/ResourceStatusUpdater.java @@ -0,0 +1,11 @@ +package com.cloud.resource; + +/** + * ResourceStatusUpdater is a resource that can trigger out of band status updates + */ +public interface ResourceStatusUpdater { + /** + * @param updater The object to call triggerUpdate() on + */ + void registerStatusUpdater(AgentStatusUpdater updater); +} diff --git a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java index cf188cbf58d4..9b1a83281903 100755 --- a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java @@ -3726,7 +3726,7 @@ public boolean processCommands(final long agentId, final long seq, final Command if (cmd instanceof PingRoutingCommand) { final PingRoutingCommand ping = (PingRoutingCommand)cmd; if (ping.getHostVmStateReport() != null) { - _syncMgr.processHostVmStatePingReport(agentId, ping.getHostVmStateReport()); + _syncMgr.processHostVmStatePingReport(agentId, ping.getHostVmStateReport(), ping.getOutOfBand()); } scanStalledVMInTransitionStateOnUpHost(agentId); diff --git a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachinePowerStateSync.java b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachinePowerStateSync.java index 152d0d889c62..b2a48a026a3c 100644 --- a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachinePowerStateSync.java +++ b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachinePowerStateSync.java @@ -27,7 +27,7 @@ public interface VirtualMachinePowerStateSync { void processHostVmStateReport(long hostId, Map report); // to adapt legacy ping report - void processHostVmStatePingReport(long hostId, Map report); + void processHostVmStatePingReport(long hostId, Map report, boolean force); Map convertVmStateReport(Map states); } diff --git a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachinePowerStateSyncImpl.java b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachinePowerStateSyncImpl.java index 815206a33bf7..3eb3569cab0a 100644 --- a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachinePowerStateSyncImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachinePowerStateSyncImpl.java @@ -55,19 +55,19 @@ public void processHostVmStateReport(long hostId, Map translatedInfo = convertVmStateReport(report); - processReport(hostId, translatedInfo); + processReport(hostId, translatedInfo, false); } @Override - public void processHostVmStatePingReport(long hostId, Map report) { + public void processHostVmStatePingReport(long hostId, Map report, boolean force) { if (s_logger.isDebugEnabled()) s_logger.debug("Process host VM state report from ping process. host: " + hostId); Map translatedInfo = convertVmStateReport(report); - processReport(hostId, translatedInfo); + processReport(hostId, translatedInfo, force); } - private void processReport(long hostId, Map translatedInfo) { + private void processReport(long hostId, Map translatedInfo, boolean force) { if (s_logger.isDebugEnabled()) { s_logger.debug("Process VM state report. host: " + hostId + ", number of records in report: " + translatedInfo.size()); @@ -117,7 +117,7 @@ private void processReport(long hostId, Map tra // Make sure powerState is up to date for missing VMs try { - if (!_instanceDao.isPowerStateUpToDate(instance.getId())) { + if (!force && !_instanceDao.isPowerStateUpToDate(instance.getId())) { s_logger.warn("Detected missing VM but power state is outdated, wait for another process report run for VM id: " + instance.getId()); _instanceDao.resetVmPowerStateTracking(instance.getId()); continue; @@ -150,7 +150,7 @@ private void processReport(long hostId, Map tra long milliSecondsSinceLastStateUpdate = currentTime.getTime() - vmStateUpdateTime.getTime(); - if (milliSecondsSinceLastStateUpdate > milliSecondsGracefullPeriod) { + if (force || milliSecondsSinceLastStateUpdate > milliSecondsGracefullPeriod) { s_logger.debug("vm id: " + instance.getId() + " - time since last state update(" + milliSecondsSinceLastStateUpdate + "ms) has passed graceful period"); // this is were a race condition might have happened if we don't re-fetch the instance; diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java index b1004f0442a5..3e06211bcf16 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java @@ -83,6 +83,7 @@ import org.libvirt.DomainInfo.DomainState; import org.libvirt.DomainInterfaceStats; import org.libvirt.DomainSnapshot; +import org.libvirt.Library; import org.libvirt.LibvirtException; import org.libvirt.MemoryStatistic; import org.libvirt.Network; @@ -90,6 +91,8 @@ import org.libvirt.SchedUlongParameter; import org.libvirt.Secret; import org.libvirt.VcpuInfo; +import org.libvirt.event.DomainEventDetail; +import org.libvirt.event.StoppedDetail; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; @@ -97,6 +100,7 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; + import com.cloud.agent.api.Answer; import com.cloud.agent.api.Command; import com.cloud.agent.api.HostVmStateReportEntry; @@ -175,6 +179,8 @@ import com.cloud.network.Networks.IsolationType; import com.cloud.network.Networks.RouterPrivateIpStrategy; import com.cloud.network.Networks.TrafficType; +import com.cloud.resource.AgentStatusUpdater; +import com.cloud.resource.ResourceStatusUpdater; import com.cloud.resource.RequestWrapper; import com.cloud.resource.ServerResource; import com.cloud.resource.ServerResourceBase; @@ -224,11 +230,12 @@ * private mac addresses for domrs | mac address | start + 126 || || * pool | the parent of the storage pool hierarchy * } **/ -public class LibvirtComputingResource extends ServerResourceBase implements ServerResource, VirtualRouterDeployer { +public class LibvirtComputingResource extends ServerResourceBase implements ServerResource, VirtualRouterDeployer, ResourceStatusUpdater { protected static Logger s_logger = Logger.getLogger(LibvirtComputingResource.class); private static final String CONFIG_VALUES_SEPARATOR = ","; + private static final String LEGACY = "legacy"; private static final String SECURE = "secure"; @@ -457,6 +464,7 @@ public class LibvirtComputingResource extends ServerResourceBase implements Serv protected CPUStat _cpuStat = new CPUStat(); protected MemStat _memStat = new MemStat(_dom0MinMem, _dom0OvercommitMem); private final LibvirtUtilitiesHelper libvirtUtilitiesHelper = new LibvirtUtilitiesHelper(); + private AgentStatusUpdater _agentStatusUpdater; protected Boolean enableManuallySettingCpuTopologyOnKvmVm = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.ENABLE_MANUALLY_SETTING_CPU_TOPOLOGY_ON_KVM_VM); @@ -481,6 +489,11 @@ protected long getHypervisorQemuVersion() { return _hypervisorQemuVersion; } + @Override + public void registerStatusUpdater(AgentStatusUpdater updater) { + _agentStatusUpdater = updater; + } + @Override public ExecutionResult executeInVR(final String routerIp, final String script, final String args) { return executeInVR(routerIp, script, args, _timeout); @@ -3590,9 +3603,60 @@ private StartupStorageCommand createLocalStoragePool(String localStoragePath, St } catch (final CloudRuntimeException e) { s_logger.debug("Unable to initialize local storage pool: " + e); } + setupLibvirtEventListener(); return sscmd; } + private void setupLibvirtEventListener() { + try { + final Thread t = new Thread(() -> { + try { + Library.runEventLoop(); + } catch (LibvirtException e) { + s_logger.error("LibvirtException was thrown in event loop: ", e); + } catch (InterruptedException e) { + s_logger.error("Libvirt event loop was interrupted: ", e); + } + }); + t.setDaemon(true); + t.start(); + + Connect conn = LibvirtConnection.getConnection(); + conn.addLifecycleListener((domain, domainEvent) -> { + try { + s_logger.debug(String.format("Got event lifecycle change on Domain %s, event %s", domain.getName(), domainEvent)); + if (domainEvent != null) { + switch (domainEvent.getType()) { + case STOPPED: + /* libvirt-destroyed VMs have detail StoppedDetail.DESTROYED, self shutdown guests are StoppedDetail.SHUTDOWN + * Checking for this helps us differentiate between events where cloudstack or admin stopped the VM vs guest + * initiated, and avoid pushing extra updates for actions we are initiating without a need for extra tracking */ + DomainEventDetail detail = domainEvent.getDetail(); + if (StoppedDetail.SHUTDOWN.equals(detail) || StoppedDetail.CRASHED.equals(detail)) { + s_logger.info("Triggering out of band status update due to completed self-shutdown or crash of VM"); + _agentStatusUpdater.triggerUpdate(); + } else { + s_logger.debug("Event detail: " + detail); + } + break; + default: + s_logger.debug(String.format("No handling for event %s", domainEvent)); + } + } + } catch (LibvirtException e) { + s_logger.error("Libvirt exception while processing lifecycle event", e); + } catch (Throwable e) { + s_logger.error("Error during lifecycle", e); + } + return 0; + }); + + s_logger.debug("Set up the libvirt domain event lifecycle listener"); + } catch (LibvirtException e) { + s_logger.error("Failed to get libvirt connection for domain event lifecycle", e); + } + } + public String diskUuidToSerial(String uuid) { String uuidWithoutHyphen = uuid.replace("-",""); return uuidWithoutHyphen.substring(0, Math.min(uuidWithoutHyphen.length(), 20)); From f598354f95278351d07135ec00142ac8b11e617c Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Mon, 18 Sep 2023 09:23:36 -0600 Subject: [PATCH 2/2] Add License headers, refactor nested try --- .../cloud/resource/AgentStatusUpdater.java | 22 +++++ .../cloud/resource/ResourceStatusUpdater.java | 18 ++++ .../resource/LibvirtComputingResource.java | 82 ++++++++++--------- 3 files changed, 83 insertions(+), 39 deletions(-) diff --git a/core/src/main/java/com/cloud/resource/AgentStatusUpdater.java b/core/src/main/java/com/cloud/resource/AgentStatusUpdater.java index d9fa3398fdab..63d5576c0602 100644 --- a/core/src/main/java/com/cloud/resource/AgentStatusUpdater.java +++ b/core/src/main/java/com/cloud/resource/AgentStatusUpdater.java @@ -1,5 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. package com.cloud.resource; +/** + * AgentStatusUpdater is an agent with triggerable update functionality + */ public interface AgentStatusUpdater { + /** + * Trigger the sending of an update (Ping). + */ void triggerUpdate(); } diff --git a/core/src/main/java/com/cloud/resource/ResourceStatusUpdater.java b/core/src/main/java/com/cloud/resource/ResourceStatusUpdater.java index 402ef7135c9a..df59e3a152e8 100644 --- a/core/src/main/java/com/cloud/resource/ResourceStatusUpdater.java +++ b/core/src/main/java/com/cloud/resource/ResourceStatusUpdater.java @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. package com.cloud.resource; /** @@ -5,6 +21,8 @@ */ public interface ResourceStatusUpdater { /** + * Register an AgentStatusUpdater to use for triggering out of band updates. + * * @param updater The object to call triggerUpdate() on */ void registerStatusUpdater(AgentStatusUpdater updater); diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java index 3e06211bcf16..ea52486e0677 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java @@ -91,6 +91,7 @@ import org.libvirt.SchedUlongParameter; import org.libvirt.Secret; import org.libvirt.VcpuInfo; +import org.libvirt.event.DomainEvent; import org.libvirt.event.DomainEventDetail; import org.libvirt.event.StoppedDetail; import org.w3c.dom.Document; @@ -3608,48 +3609,22 @@ private StartupStorageCommand createLocalStoragePool(String localStoragePath, St } private void setupLibvirtEventListener() { + final Thread libvirtListenerThread = new Thread(() -> { + try { + Library.runEventLoop(); + } catch (LibvirtException e) { + s_logger.error("LibvirtException was thrown in event loop: ", e); + } catch (InterruptedException e) { + s_logger.error("Libvirt event loop was interrupted: ", e); + } + }); + try { - final Thread t = new Thread(() -> { - try { - Library.runEventLoop(); - } catch (LibvirtException e) { - s_logger.error("LibvirtException was thrown in event loop: ", e); - } catch (InterruptedException e) { - s_logger.error("Libvirt event loop was interrupted: ", e); - } - }); - t.setDaemon(true); - t.start(); + libvirtListenerThread.setDaemon(true); + libvirtListenerThread.start(); Connect conn = LibvirtConnection.getConnection(); - conn.addLifecycleListener((domain, domainEvent) -> { - try { - s_logger.debug(String.format("Got event lifecycle change on Domain %s, event %s", domain.getName(), domainEvent)); - if (domainEvent != null) { - switch (domainEvent.getType()) { - case STOPPED: - /* libvirt-destroyed VMs have detail StoppedDetail.DESTROYED, self shutdown guests are StoppedDetail.SHUTDOWN - * Checking for this helps us differentiate between events where cloudstack or admin stopped the VM vs guest - * initiated, and avoid pushing extra updates for actions we are initiating without a need for extra tracking */ - DomainEventDetail detail = domainEvent.getDetail(); - if (StoppedDetail.SHUTDOWN.equals(detail) || StoppedDetail.CRASHED.equals(detail)) { - s_logger.info("Triggering out of band status update due to completed self-shutdown or crash of VM"); - _agentStatusUpdater.triggerUpdate(); - } else { - s_logger.debug("Event detail: " + detail); - } - break; - default: - s_logger.debug(String.format("No handling for event %s", domainEvent)); - } - } - } catch (LibvirtException e) { - s_logger.error("Libvirt exception while processing lifecycle event", e); - } catch (Throwable e) { - s_logger.error("Error during lifecycle", e); - } - return 0; - }); + conn.addLifecycleListener(this::onDomainLifecycleChange); s_logger.debug("Set up the libvirt domain event lifecycle listener"); } catch (LibvirtException e) { @@ -3657,6 +3632,35 @@ private void setupLibvirtEventListener() { } } + private int onDomainLifecycleChange(Domain domain, DomainEvent domainEvent) { + try { + s_logger.debug(String.format("Got event lifecycle change on Domain %s, event %s", domain.getName(), domainEvent)); + if (domainEvent != null) { + switch (domainEvent.getType()) { + case STOPPED: + /* libvirt-destroyed VMs have detail StoppedDetail.DESTROYED, self shutdown guests are StoppedDetail.SHUTDOWN + * Checking for this helps us differentiate between events where cloudstack or admin stopped the VM vs guest + * initiated, and avoid pushing extra updates for actions we are initiating without a need for extra tracking */ + DomainEventDetail detail = domainEvent.getDetail(); + if (StoppedDetail.SHUTDOWN.equals(detail) || StoppedDetail.CRASHED.equals(detail)) { + s_logger.info("Triggering out of band status update due to completed self-shutdown or crash of VM"); + _agentStatusUpdater.triggerUpdate(); + } else { + s_logger.debug("Event detail: " + detail); + } + break; + default: + s_logger.debug(String.format("No handling for event %s", domainEvent)); + } + } + } catch (LibvirtException e) { + s_logger.error("Libvirt exception while processing lifecycle event", e); + } catch (Throwable e) { + s_logger.error("Error during lifecycle", e); + } + return 0; + } + public String diskUuidToSerial(String uuid) { String uuidWithoutHyphen = uuid.replace("-",""); return uuidWithoutHyphen.substring(0, Math.min(uuidWithoutHyphen.length(), 20));