Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server: improve behavior of starting VMs that are waiting for Crucible activation #873

Merged
merged 13 commits into from
Mar 14, 2025
Merged
75 changes: 20 additions & 55 deletions bin/propolis-server/src/lib/vm/objects.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};

use crate::{serial::Serial, spec::Spec, vcpu_tasks::VcpuTaskController};

use super::{
state_driver::VmStartReason, BlockBackendMap, CrucibleBackendMap, DeviceMap,
};
use super::{BlockBackendMap, CrucibleBackendMap, DeviceMap};

/// A collection of components that make up a Propolis VM instance.
pub(crate) struct VmObjects {
Expand Down Expand Up @@ -189,6 +187,14 @@ impl VmObjectsLocked {
&self.ps2ctrl
}

pub(crate) fn device_map(&self) -> &DeviceMap {
&self.devices
}

pub(crate) fn block_backend_map(&self) -> &BlockBackendMap {
&self.block_backends
}

/// Iterates over all of the lifecycle trait objects in this VM and calls
/// `func` on each one.
pub(crate) fn for_each_device(
Expand Down Expand Up @@ -244,33 +250,6 @@ impl VmObjectsLocked {
self.machine.reinitialize().unwrap();
}

/// Starts a VM's devices and allows all of its vCPU tasks to run.
///
/// This function may be called either after initializing a new VM from
/// scratch or after an inbound live migration. In the latter case, this
/// routine assumes that the caller initialized and activated the VM's vCPUs
/// prior to importing state from the migration source.
pub(super) async fn start(
&mut self,
reason: VmStartReason,
) -> anyhow::Result<()> {
match reason {
VmStartReason::ExplicitRequest => {
self.reset_vcpus();
}
VmStartReason::MigratedIn => {
self.resume_kernel_vm();
}
}

let result = self.start_devices().await;
if result.is_ok() {
self.vcpu_tasks.resume_all();
}

result
}

/// Pauses this VM's devices and its kernel VMM.
pub(crate) async fn pause(&mut self) {
// Order matters here: the Propolis lifecycle trait's pause function
Expand All @@ -288,6 +267,16 @@ impl VmObjectsLocked {
// that all devices resume before any vCPUs do.
self.resume_kernel_vm();
self.resume_devices();
self.resume_vcpus();
}

/// Resumes this VM's vCPU tasks.
///
/// This is intended for use in VM startup sequences where the state driver
/// needs fine-grained control over the order in which devices and vCPUs
/// start. When pausing and resuming a VM that's already been started, use
/// [`Self::pause`] and [`Self::resume`] instead.
pub(crate) fn resume_vcpus(&mut self) {
self.vcpu_tasks.resume_all();
}

Expand Down Expand Up @@ -321,31 +310,7 @@ impl VmObjectsLocked {
// Resume devices so they're ready to do more work, then resume
// vCPUs.
self.resume_devices();
self.vcpu_tasks.resume_all();
}

/// Starts all of a VM's devices and allows its block backends to process
/// requests from their devices.
async fn start_devices(&self) -> anyhow::Result<()> {
self.for_each_device_fallible(|name, dev| {
info!(self.log, "sending startup complete to {}", name);
let res = dev.start();
if let Err(e) = &res {
error!(self.log, "startup failed for {}: {:?}", name, e);
}
res
})?;

for (name, backend) in self.block_backends.iter() {
info!(self.log, "starting block backend {}", name);
let res = backend.start().await;
if let Err(e) = &res {
error!(self.log, "startup failed for {}: {:?}", name, e);
return res;
}
}

Ok(())
self.resume_vcpus();
}

/// Pauses all of a VM's devices.
Expand Down
37 changes: 22 additions & 15 deletions bin/propolis-server/src/lib/vm/request_queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,6 @@ pub enum ExternalRequest {

/// Attempts to update the volume construction request for the supplied
/// Crucible volume.
///
/// TODO: Due to https://github.com/oxidecomputer/crucible/issues/871, this
/// is only allowed once the VM is started and the volume has activated, but
/// it should be allowed even before the VM has started.
ReconfigureCrucibleVolume {
/// The ID of the Crucible backend in the VM's Crucible backend map.
backend_id: SpecKey,
Expand Down Expand Up @@ -210,9 +206,7 @@ impl ExternalRequestQueue {
reboot: RequestDisposition::Deny(
RequestDeniedReason::InstanceNotActive,
),
mutate: RequestDisposition::Deny(
RequestDeniedReason::InstanceNotActive,
),
mutate: RequestDisposition::Enqueue,
stop: RequestDisposition::Enqueue,
},
log,
Expand Down Expand Up @@ -295,7 +289,7 @@ impl ExternalRequestQueue {
start: Disposition::Ignore,
migrate_as_source: Disposition::Deny(reason),
reboot: Disposition::Deny(reason),
mutate: Disposition::Deny(reason),
mutate: Disposition::Enqueue,
stop: self.allowed.stop,
}
}
Expand Down Expand Up @@ -577,18 +571,27 @@ mod test {
}

#[tokio::test]
async fn mutation_requires_running_and_not_migrating_out() {
async fn mutation_requires_not_migrating_out() {
let mut queue =
ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No);

// Mutating a VM before it has started is not allowed.
assert!(queue.try_queue(make_reconfigure_crucible_request()).is_err());
// Mutating a VM before it has started is allowed.
assert!(queue.try_queue(make_reconfigure_crucible_request()).is_ok());
assert!(matches!(
queue.pop_front(),
Some(ExternalRequest::ReconfigureCrucibleVolume { .. })
));

// Merely dequeuing the start request doesn't allow mutation; the VM
// actually has to be running.
// Mutating a VM is also allowed while it is starting.
assert!(queue.try_queue(ExternalRequest::Start).is_ok());
assert!(matches!(queue.pop_front(), Some(ExternalRequest::Start)));
assert!(queue.try_queue(make_reconfigure_crucible_request()).is_err());
assert!(queue.try_queue(make_reconfigure_crucible_request()).is_ok());
assert!(matches!(
queue.pop_front(),
Some(ExternalRequest::ReconfigureCrucibleVolume { .. })
));

// And it's allowed once the VM has started running.
queue.notify_instance_state_change(InstanceStateChange::StartedRunning);
assert!(queue.try_queue(make_reconfigure_crucible_request()).is_ok());
assert!(matches!(
Expand All @@ -610,10 +613,14 @@ mod test {
}

#[tokio::test]
async fn mutation_disallowed_after_stop() {
async fn mutation_disallowed_after_stop_requested() {
let mut queue =
ExternalRequestQueue::new(test_logger(), InstanceAutoStart::Yes);
queue.notify_instance_state_change(InstanceStateChange::StartedRunning);

assert!(queue.try_queue(ExternalRequest::Stop).is_ok());
assert!(queue.try_queue(make_reconfigure_crucible_request()).is_err());

queue.notify_instance_state_change(InstanceStateChange::Stopped);
assert!(queue.try_queue(make_reconfigure_crucible_request()).is_err());
}
Expand Down
Loading
Loading