From 2f1d33e3dc704415c049ee7181fb374b4c87f505 Mon Sep 17 00:00:00 2001 From: Patrick Mooney Date: Tue, 4 Apr 2023 15:14:54 -0500 Subject: [PATCH] Support illumos#15143 data and vCPU pause/resume With the integration of illumos#15143, additional pending interrupt and exception state for guest vCPUs is exposed to userspace. In order to take a consistent snapshot of said state, as well as the guest devices emulated in-kernel, propolis-server needs to issue VM_PAUSE requests prior to exporting or importing state, and VM_RESUME requests when starting a VM after a migration. Importation of the "new" vCPU state is guarded with a version check for now, so that propolis continues to work on hosts which have not yet updated to bits featuring illumos#15143. --- bin/propolis-server/src/lib/vm/mod.rs | 33 ++++++ .../src/lib/vm/state_driver.rs | 61 +++++++++++ crates/bhyve-api/src/lib.rs | 58 +++++++--- crates/bhyve-api/sys/src/lib.rs | 2 +- crates/bhyve-api/sys/src/vmm_data.rs | 15 +++ lib/propolis/src/vcpu.rs | 100 ++++++++++++++++-- 6 files changed, 249 insertions(+), 20 deletions(-) diff --git a/bin/propolis-server/src/lib/vm/mod.rs b/bin/propolis-server/src/lib/vm/mod.rs index 4c900a837..7a7b0741e 100644 --- a/bin/propolis-server/src/lib/vm/mod.rs +++ b/bin/propolis-server/src/lib/vm/mod.rs @@ -843,6 +843,19 @@ enum StateDriverEvent { /// functionality. #[cfg_attr(test, mockall::automock)] trait StateDriverVmController { + /// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated + /// devices and vCPUs are brought to a consistent state. + /// + /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl) + /// will fail. A corresponding `resume_vm()` call must be made prior to + /// allowing vCPU tasks to run. + fn pause_vm(&self); + + /// Resume a previously-paused VM at the kernel VMM level. This will resume + /// any timers driving in-kernel-emulated devices, and allow the vCPU to run + /// again. + fn resume_vm(&self); + /// Sends a reset request to each entity in the instance, then sends a /// reset command to the instance's bhyve VM. fn reset_entities_and_machine(&self); @@ -865,6 +878,26 @@ trait StateDriverVmController { } impl StateDriverVmController for VmController { + fn pause_vm(&self) { + info!(self.log, "Pausing kernel VMM resources"); + self.instance() + .lock() + .machine() + .hdl + .pause() + .expect("VM_PAUSE should succeed") + } + + fn resume_vm(&self) { + info!(self.log, "Resuming kernel VMM resources"); + self.instance() + .lock() + .machine() + .hdl + .resume() + .expect("VM_RESUME should succeed") + } + fn reset_entities_and_machine(&self) { self.for_each_entity(|ent, rec| { info!(self.log, "Sending reset request to {}", rec.name()); diff --git a/bin/propolis-server/src/lib/vm/state_driver.rs b/bin/propolis-server/src/lib/vm/state_driver.rs index 239c823ec..637229881 100644 --- a/bin/propolis-server/src/lib/vm/state_driver.rs +++ b/bin/propolis-server/src/lib/vm/state_driver.rs @@ -274,6 +274,10 @@ where self.get_instance_state(), ApiInstanceState::Migrating ); + // Signal the kernel VMM to resume devices which are handled by + // the in-kernel emulation. They were kept paused for + // consistency while migration state was loaded. + self.controller.resume_vm(); } } @@ -347,6 +351,10 @@ where // migration. self.reset_vcpus(); + // Place the VM in a paused state so we can load emulated device state + // in a consistent manner + self.controller.pause_vm(); + start_tx.send(()).unwrap(); loop { let action = self.runtime_hdl.block_on(async { @@ -437,6 +445,7 @@ where )); if self.paused { + self.controller.resume_vm(); self.controller.resume_entities(); self.vcpu_tasks.resume_all(); self.paused = false; @@ -455,6 +464,7 @@ where MigrateSourceCommand::Pause => { self.vcpu_tasks.pause_all(); self.controller.pause_entities(); + self.controller.pause_vm(); self.paused = true; response_tx .blocking_send(MigrateSourceResponse::Pause(Ok(()))) @@ -810,6 +820,8 @@ mod tests { vm_ctrl.expect_halt_entities().times(1).returning(|| ()); vm_ctrl.expect_resume_entities().never(); vcpu_ctrl.expect_resume_all().never(); + vm_ctrl.expect_pause_vm().times(1).returning(|| ()); + vm_ctrl.expect_resume_vm().never(); let mut driver = make_state_driver(test_objects); @@ -893,6 +905,20 @@ mod tests { vcpu_ctrl.expect_pause_all().times(1).returning(|| ()); vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); + // VMM will be paused once prior to exporting state, and then resumed + // afterwards when the migration fails. + let mut pause_seq = Sequence::new(); + vm_ctrl + .expect_pause_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + vm_ctrl + .expect_resume_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + let mut driver = make_state_driver(test_objects); let hdl = std::thread::spawn(move || { let outcome = driver.driver.handle_event( @@ -961,6 +987,18 @@ mod tests { vm_ctrl.expect_start_entities().times(1).returning(|| Ok(())); vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); + let mut pause_seq = Sequence::new(); + vm_ctrl + .expect_pause_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + vm_ctrl + .expect_resume_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + let mut driver = make_state_driver(test_objects); // The state driver expects to run on an OS thread outside the async @@ -1021,6 +1059,8 @@ mod tests { vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); + vm_ctrl.expect_pause_vm().times(1).returning(|| ()); + vm_ctrl.expect_resume_vm().never(); let mut driver = make_state_driver(test_objects); // The state driver expects to run on an OS thread outside the async @@ -1084,6 +1124,19 @@ mod tests { vcpu_ctrl.expect_new_generation().times(1).returning(|| ()); vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ()); + + let mut pause_seq = Sequence::new(); + vm_ctrl + .expect_pause_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + vm_ctrl + .expect_resume_vm() + .times(1) + .in_sequence(&mut pause_seq) + .returning(|| ()); + vm_ctrl .expect_start_entities() .times(1) @@ -1148,6 +1201,14 @@ mod tests { vcpu_ctrl.expect_resume_all().times(1).returning(|| ()); vm_ctrl.expect_start_entities().times(1).returning(|| Ok(())); + // As noted below, the instance state is being magicked directly into a + // `Migrating` state, rather than executing the logic which would + // typically carry it there. As such, `pause_vm()` will not be called + // as part of setup. Since instance start _is_ being tested here, the + // `resume_vm()` call is expected. + vm_ctrl.expect_pause_vm().never(); + vm_ctrl.expect_resume_vm().times(1).returning(|| ()); + // Skip the rigmarole of standing up a fake migration. Instead, just // push the driver into the state it would have after a successful // migration to appease the assertions in `start_vm`. diff --git a/crates/bhyve-api/src/lib.rs b/crates/bhyve-api/src/lib.rs index d1324a657..01955af68 100644 --- a/crates/bhyve-api/src/lib.rs +++ b/crates/bhyve-api/src/lib.rs @@ -3,6 +3,7 @@ use std::io::{Error, ErrorKind, Result}; use std::os::fd::*; use std::os::unix::fs::OpenOptionsExt; use std::path::PathBuf; +use std::sync::atomic::{AtomicI64, Ordering}; use num_enum::IntoPrimitive; @@ -193,19 +194,46 @@ impl AsRawFd for VmmFd { } } -/// Check that bhyve kernel VMM component matches version underlying interfaces -/// defined in bhyve-api. Can use a user-provided version (via `version` arg) -/// or the current one known to bhyve_api. -pub fn check_version(version: Option) -> Result { - let ctl = VmmCtlFd::open()?; - let current_vers = unsafe { - ctl.ioctl( - ioctls::VMM_INTERFACE_VERSION, - std::ptr::null_mut() as *mut libc::c_void, - ) - }?; +/// Store a cached copy of the queried API version. Negative values indicate an +/// error occurred during query (and hold the corresponding negated `errno`). +/// A positive value indicates the cached version, and should be less than +/// `u32::MAX`. A value of 0 indicates that no query has been performed yet. +static VERSION_CACHE: AtomicI64 = AtomicI64::new(0); + +/// Query the API version from the kernel VMM component on the system. +/// +/// Caches said version (or any emitted error) for later calls. +pub fn api_version() -> Result { + fn do_query() -> Result { + let ctl = VmmCtlFd::open()?; + let vers = ctl.api_version()?; + Ok(vers) + } + + if VERSION_CACHE.load(Ordering::Acquire) == 0 { + let newval = match do_query() { + Ok(x) => x as i64, + Err(e) => -(e.raw_os_error().unwrap_or(libc::ENOENT) as i64), + }; + let _ = VERSION_CACHE.compare_exchange( + 0, + newval, + Ordering::Relaxed, + Ordering::Relaxed, + ); + } - Ok(current_vers as u32 == version.unwrap_or(VMM_CURRENT_INTERFACE_VERSION)) + match VERSION_CACHE.load(Ordering::Acquire) { + 0 => { + panic!("expected VERSION_CACHE to be initialized") + } + x if x < 0 => Err(Error::from_raw_os_error(-x as i32)), + y => { + assert!(y < u32::MAX as i64); + + Ok(y as u32) + } + } } #[cfg(target_os = "illumos")] @@ -230,6 +258,10 @@ unsafe fn ioctl( #[repr(u32)] #[derive(IntoPrimitive)] pub enum ApiVersion { + /// Interrupt and exception state is properly saved/restored on VM + /// pause/resume, and is exposed via vmm-data interface + V10 = 10, + /// Revamps ioctls for administrating the VMM memory reservoir and adds /// kstat for tracking its capacity and utilization. V9 = 9, @@ -250,7 +282,7 @@ pub enum ApiVersion { } impl ApiVersion { pub const fn current() -> Self { - Self::V9 + Self::V10 } } diff --git a/crates/bhyve-api/sys/src/lib.rs b/crates/bhyve-api/sys/src/lib.rs index 7857ae209..2b224090c 100644 --- a/crates/bhyve-api/sys/src/lib.rs +++ b/crates/bhyve-api/sys/src/lib.rs @@ -13,4 +13,4 @@ pub const VM_MAXCPU: usize = 32; /// This is the VMM interface version which bhyve_api expects to operate /// against. All constants and structs defined by the crate are done so in /// terms of that specific version. -pub const VMM_CURRENT_INTERFACE_VERSION: u32 = 9; +pub const VMM_CURRENT_INTERFACE_VERSION: u32 = 10; diff --git a/crates/bhyve-api/sys/src/vmm_data.rs b/crates/bhyve-api/sys/src/vmm_data.rs index 30685bdf4..ae7705d26 100644 --- a/crates/bhyve-api/sys/src/vmm_data.rs +++ b/crates/bhyve-api/sys/src/vmm_data.rs @@ -191,9 +191,24 @@ impl Default for vdi_rtc_v1 { // VDC_VMM_ARCH v1 data identifiers +// VM-wide: + /// Offset of guest TSC from system at time of boot pub const VAI_TSC_BOOT_OFFSET: u32 = 1; /// Time that guest (nominally) booted, as hrtime pub const VAI_BOOT_HRTIME: u32 = 2; /// Guest TSC frequency measured by hrtime (not effected by wall clock adj.) pub const VAI_TSC_FREQ: u32 = 3; +/// Guest instance has been placed in paused state +pub const VAI_VM_IS_PAUSED: u32 = 4; + +// per-vCPU + +/// NMI pending injection for vCPU (0 or 1) +pub const VAI_PEND_NMI: u32 = 10; +/// extint pending injection for vCPU (0 or 1) +pub const VAI_PEND_EXTINT: u32 = 11; +/// HW exception pending injection for vCPU +pub const VAI_PEND_EXCP: u32 = 12; +/// exception/interrupt pending injection for vCPU +pub const VAI_PEND_INTINFO: u32 = 13; diff --git a/lib/propolis/src/vcpu.rs b/lib/propolis/src/vcpu.rs index 406910870..9eb92177f 100644 --- a/lib/propolis/src/vcpu.rs +++ b/lib/propolis/src/vcpu.rs @@ -271,13 +271,17 @@ pub mod migrate { /// TODO: do not use the bhyve_api type lapic: bhyve_api::vdi_lapic_v1, ms_regs: Vec, - // exception/interrupt state } #[derive(Clone, Default, Deserialize, Serialize)] pub struct BhyveVcpuRunStateV1 { - state: u32, - sipi_vector: u8, + pub run_state: u32, + pub sipi_vector: u8, + + pub pending_nmi: bool, + pub pending_extint: bool, + pub pending_exception: u64, + pub pending_intinfo: u64, } #[derive(Copy, Clone, Default, Deserialize, Serialize)] @@ -351,12 +355,96 @@ pub mod migrate { impl BhyveVcpuRunStateV1 { fn read(vcpu: &Vcpu) -> io::Result { - let state = vcpu.get_run_state()?; - Ok(Self { state: state.state, sipi_vector: state.sipi_vector }) + let run_state = vcpu.get_run_state()?; + + let vmm_arch: Vec = + vmm::data::read_many( + vcpu.hdl.as_ref(), + vcpu.id, + bhyve_api::VDC_VMM_ARCH, + 1, + )?; + + // Load all of the pending interrupt/exception state + // + // If illumos#15143 support is missing, none of these fields will be + // present, so the values will remain false/zeroed. Such an outcome + // is fine for now. + let ( + mut pending_nmi, + mut pending_extint, + mut pending_exception, + mut pending_intinfo, + ) = (false, false, 0, 0); + for ent in vmm_arch.iter() { + match ent.vfe_ident { + bhyve_api::VAI_PEND_NMI => pending_nmi = ent.vfe_value != 0, + bhyve_api::VAI_PEND_EXTINT => { + pending_extint = ent.vfe_value != 0 + } + bhyve_api::VAI_PEND_EXCP => { + pending_exception = ent.vfe_value + } + bhyve_api::VAI_PEND_INTINFO => { + pending_intinfo = ent.vfe_value + } + _ => {} + } + } + + Ok(Self { + run_state: run_state.state, + sipi_vector: run_state.sipi_vector, + pending_nmi, + pending_extint, + pending_exception, + pending_intinfo, + }) } fn write(self, vcpu: &Vcpu) -> io::Result<()> { - vcpu.set_run_state(self.state, Some(self.sipi_vector)) + vcpu.set_run_state(self.run_state, Some(self.sipi_vector))?; + + let mut ents = [ + vdi_field_entry_v1 { + vfe_ident: bhyve_api::VAI_PEND_NMI, + vfe_value: self.pending_nmi as u64, + ..Default::default() + }, + vdi_field_entry_v1 { + vfe_ident: bhyve_api::VAI_PEND_EXTINT, + vfe_value: self.pending_extint as u64, + ..Default::default() + }, + vdi_field_entry_v1 { + vfe_ident: bhyve_api::VAI_PEND_EXCP, + vfe_value: self.pending_exception, + ..Default::default() + }, + vdi_field_entry_v1 { + vfe_ident: bhyve_api::VAI_PEND_INTINFO, + vfe_value: self.pending_intinfo, + ..Default::default() + }, + ]; + + // Do not attempt to import interrupt/exception state unless there + // is proper support for it on the host we are running upon. + // + // When hosts with illumos#15143 integrated become common, the + // overall required version for propolis can grow to encompass V10 + // and this check can be elided. + if bhyve_api::api_version()? >= bhyve_api::ApiVersion::V10 as u32 { + vmm::data::write_many( + vcpu.hdl.as_ref(), + vcpu.id, + bhyve_api::VDC_VMM_ARCH, + 1, + &mut ents, + )?; + } + + Ok(()) } }