Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions bin/propolis-server/src/lib/vm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,19 @@ enum StateDriverEvent {
/// functionality.
#[cfg_attr(test, mockall::automock)]
trait StateDriverVmController {
/// Pause VM at the kernel VMM level, ensuring that in-kernel-emulated
/// devices and vCPUs are brought to a consistent state.
///
/// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl)
/// will fail. A corresponding `resume_vm()` call must be made prior to
/// allowing vCPU tasks to run.
fn pause_vm(&self);

/// Resume a previously-paused VM at the kernel VMM level. This will resume
/// any timers driving in-kernel-emulated devices, and allow the vCPU to run
/// again.
fn resume_vm(&self);

/// Sends a reset request to each entity in the instance, then sends a
/// reset command to the instance's bhyve VM.
fn reset_entities_and_machine(&self);
Expand All @@ -865,6 +878,26 @@ trait StateDriverVmController {
}

impl StateDriverVmController for VmController {
fn pause_vm(&self) {
info!(self.log, "Pausing kernel VMM resources");
self.instance()
.lock()
.machine()
.hdl
.pause()
.expect("VM_PAUSE should succeed")
}

fn resume_vm(&self) {
info!(self.log, "Resuming kernel VMM resources");
self.instance()
.lock()
.machine()
.hdl
.resume()
.expect("VM_RESUME should succeed")
}

fn reset_entities_and_machine(&self) {
self.for_each_entity(|ent, rec| {
info!(self.log, "Sending reset request to {}", rec.name());
Expand Down
61 changes: 61 additions & 0 deletions bin/propolis-server/src/lib/vm/state_driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,10 @@ where
self.get_instance_state(),
ApiInstanceState::Migrating
);
// Signal the kernel VMM to resume devices which are handled by
// the in-kernel emulation. They were kept paused for
// consistency while migration state was loaded.
self.controller.resume_vm();
}
}

Expand Down Expand Up @@ -347,6 +351,10 @@ where
// migration.
self.reset_vcpus();

// Place the VM in a paused state so we can load emulated device state
// in a consistent manner
self.controller.pause_vm();

start_tx.send(()).unwrap();
loop {
let action = self.runtime_hdl.block_on(async {
Expand Down Expand Up @@ -437,6 +445,7 @@ where
));

if self.paused {
self.controller.resume_vm();
self.controller.resume_entities();
self.vcpu_tasks.resume_all();
self.paused = false;
Expand All @@ -455,6 +464,7 @@ where
MigrateSourceCommand::Pause => {
self.vcpu_tasks.pause_all();
self.controller.pause_entities();
self.controller.pause_vm();
self.paused = true;
response_tx
.blocking_send(MigrateSourceResponse::Pause(Ok(())))
Expand Down Expand Up @@ -810,6 +820,8 @@ mod tests {
vm_ctrl.expect_halt_entities().times(1).returning(|| ());
vm_ctrl.expect_resume_entities().never();
vcpu_ctrl.expect_resume_all().never();
vm_ctrl.expect_pause_vm().times(1).returning(|| ());
vm_ctrl.expect_resume_vm().never();

let mut driver = make_state_driver(test_objects);

Expand Down Expand Up @@ -893,6 +905,20 @@ mod tests {
vcpu_ctrl.expect_pause_all().times(1).returning(|| ());
vcpu_ctrl.expect_resume_all().times(1).returning(|| ());

// VMM will be paused once prior to exporting state, and then resumed
// afterwards when the migration fails.
let mut pause_seq = Sequence::new();
vm_ctrl
.expect_pause_vm()
.times(1)
.in_sequence(&mut pause_seq)
.returning(|| ());
vm_ctrl
.expect_resume_vm()
.times(1)
.in_sequence(&mut pause_seq)
.returning(|| ());

let mut driver = make_state_driver(test_objects);
let hdl = std::thread::spawn(move || {
let outcome = driver.driver.handle_event(
Expand Down Expand Up @@ -961,6 +987,18 @@ mod tests {
vm_ctrl.expect_start_entities().times(1).returning(|| Ok(()));
vcpu_ctrl.expect_resume_all().times(1).returning(|| ());

let mut pause_seq = Sequence::new();
vm_ctrl
.expect_pause_vm()
.times(1)
.in_sequence(&mut pause_seq)
.returning(|| ());
vm_ctrl
.expect_resume_vm()
.times(1)
.in_sequence(&mut pause_seq)
.returning(|| ());

let mut driver = make_state_driver(test_objects);

// The state driver expects to run on an OS thread outside the async
Expand Down Expand Up @@ -1021,6 +1059,8 @@ mod tests {

vcpu_ctrl.expect_new_generation().times(1).returning(|| ());
vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ());
vm_ctrl.expect_pause_vm().times(1).returning(|| ());
vm_ctrl.expect_resume_vm().never();
let mut driver = make_state_driver(test_objects);

// The state driver expects to run on an OS thread outside the async
Expand Down Expand Up @@ -1084,6 +1124,19 @@ mod tests {

vcpu_ctrl.expect_new_generation().times(1).returning(|| ());
vm_ctrl.expect_reset_vcpu_state().times(1).returning(|| ());

let mut pause_seq = Sequence::new();
vm_ctrl
.expect_pause_vm()
.times(1)
.in_sequence(&mut pause_seq)
.returning(|| ());
vm_ctrl
.expect_resume_vm()
.times(1)
.in_sequence(&mut pause_seq)
.returning(|| ());

vm_ctrl
.expect_start_entities()
.times(1)
Expand Down Expand Up @@ -1148,6 +1201,14 @@ mod tests {
vcpu_ctrl.expect_resume_all().times(1).returning(|| ());
vm_ctrl.expect_start_entities().times(1).returning(|| Ok(()));

// As noted below, the instance state is being magicked directly into a
// `Migrating` state, rather than executing the logic which would
// typically carry it there. As such, `pause_vm()` will not be called
// as part of setup. Since instance start _is_ being tested here, the
// `resume_vm()` call is expected.
vm_ctrl.expect_pause_vm().never();
vm_ctrl.expect_resume_vm().times(1).returning(|| ());

// Skip the rigmarole of standing up a fake migration. Instead, just
// push the driver into the state it would have after a successful
// migration to appease the assertions in `start_vm`.
Expand Down
58 changes: 45 additions & 13 deletions crates/bhyve-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::io::{Error, ErrorKind, Result};
use std::os::fd::*;
use std::os::unix::fs::OpenOptionsExt;
use std::path::PathBuf;
use std::sync::atomic::{AtomicI64, Ordering};

use num_enum::IntoPrimitive;

Expand Down Expand Up @@ -193,19 +194,46 @@ impl AsRawFd for VmmFd {
}
}

/// Check that bhyve kernel VMM component matches version underlying interfaces
/// defined in bhyve-api. Can use a user-provided version (via `version` arg)
/// or the current one known to bhyve_api.
pub fn check_version(version: Option<u32>) -> Result<bool> {
let ctl = VmmCtlFd::open()?;
let current_vers = unsafe {
ctl.ioctl(
ioctls::VMM_INTERFACE_VERSION,
std::ptr::null_mut() as *mut libc::c_void,
)
}?;
/// Store a cached copy of the queried API version. Negative values indicate an
/// error occurred during query (and hold the corresponding negated `errno`).
/// A positive value indicates the cached version, and should be less than
/// `u32::MAX`. A value of 0 indicates that no query has been performed yet.
static VERSION_CACHE: AtomicI64 = AtomicI64::new(0);

/// Query the API version from the kernel VMM component on the system.
///
/// Caches said version (or any emitted error) for later calls.
pub fn api_version() -> Result<u32> {
fn do_query() -> Result<u32> {
let ctl = VmmCtlFd::open()?;
let vers = ctl.api_version()?;
Ok(vers)
}

if VERSION_CACHE.load(Ordering::Acquire) == 0 {
let newval = match do_query() {
Ok(x) => x as i64,
Err(e) => -(e.raw_os_error().unwrap_or(libc::ENOENT) as i64),
};
let _ = VERSION_CACHE.compare_exchange(
0,
newval,
Ordering::Relaxed,
Ordering::Relaxed,
);
}

Ok(current_vers as u32 == version.unwrap_or(VMM_CURRENT_INTERFACE_VERSION))
match VERSION_CACHE.load(Ordering::Acquire) {
0 => {
panic!("expected VERSION_CACHE to be initialized")
}
x if x < 0 => Err(Error::from_raw_os_error(-x as i32)),
y => {
assert!(y < u32::MAX as i64);

Ok(y as u32)
}
}
}

#[cfg(target_os = "illumos")]
Expand All @@ -230,6 +258,10 @@ unsafe fn ioctl(
#[repr(u32)]
#[derive(IntoPrimitive)]
pub enum ApiVersion {
/// Interrupt and exception state is properly saved/restored on VM
/// pause/resume, and is exposed via vmm-data interface
V10 = 10,

/// Revamps ioctls for administrating the VMM memory reservoir and adds
/// kstat for tracking its capacity and utilization.
V9 = 9,
Expand All @@ -250,7 +282,7 @@ pub enum ApiVersion {
}
impl ApiVersion {
pub const fn current() -> Self {
Self::V9
Self::V10
}
}

Expand Down
2 changes: 1 addition & 1 deletion crates/bhyve-api/sys/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ pub const VM_MAXCPU: usize = 32;
/// This is the VMM interface version which bhyve_api expects to operate
/// against. All constants and structs defined by the crate are done so in
/// terms of that specific version.
pub const VMM_CURRENT_INTERFACE_VERSION: u32 = 9;
pub const VMM_CURRENT_INTERFACE_VERSION: u32 = 10;
15 changes: 15 additions & 0 deletions crates/bhyve-api/sys/src/vmm_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,9 +191,24 @@ impl Default for vdi_rtc_v1 {

// VDC_VMM_ARCH v1 data identifiers

// VM-wide:

/// Offset of guest TSC from system at time of boot
pub const VAI_TSC_BOOT_OFFSET: u32 = 1;
/// Time that guest (nominally) booted, as hrtime
pub const VAI_BOOT_HRTIME: u32 = 2;
/// Guest TSC frequency measured by hrtime (not effected by wall clock adj.)
pub const VAI_TSC_FREQ: u32 = 3;
/// Guest instance has been placed in paused state
pub const VAI_VM_IS_PAUSED: u32 = 4;

// per-vCPU

/// NMI pending injection for vCPU (0 or 1)
pub const VAI_PEND_NMI: u32 = 10;
/// extint pending injection for vCPU (0 or 1)
pub const VAI_PEND_EXTINT: u32 = 11;
/// HW exception pending injection for vCPU
pub const VAI_PEND_EXCP: u32 = 12;
/// exception/interrupt pending injection for vCPU
pub const VAI_PEND_INTINFO: u32 = 13;
Loading