Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server: improve behavior of starting VMs that are waiting for Crucible activation #873

Merged
merged 13 commits into from
Mar 14, 2025
Merged
10 changes: 5 additions & 5 deletions bin/propolis-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,8 @@ enum Command {
/// Call the VolumeConstructionRequest replace endpoint
Vcr {
/// Uuid for the disk
#[clap(short = 'u', action)]
uuid: Uuid,
#[clap(short = 'd', action)]
disk_id: String,

/// File with a JSON InstanceVcrReplace struct
#[clap(long, action)]
Expand Down Expand Up @@ -510,7 +510,7 @@ async fn new_instance(

async fn replace_vcr(
client: &Client,
id: Uuid,
id: String,
vcr_replace: InstanceVcrReplace,
) -> anyhow::Result<()> {
// Try to call the endpoint
Expand Down Expand Up @@ -941,9 +941,9 @@ async fn main() -> anyhow::Result<()> {
}
Command::Monitor => monitor(addr).await?,
Command::InjectNmi => inject_nmi(&client).await?,
Command::Vcr { uuid, vcr_replace } => {
Command::Vcr { disk_id, vcr_replace } => {
let replace: InstanceVcrReplace = parse_json_file(&vcr_replace)?;
replace_vcr(&client, uuid, replace).await?
replace_vcr(&client, disk_id, replace).await?
}
}

Expand Down
75 changes: 20 additions & 55 deletions bin/propolis-server/src/lib/vm/objects.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};

use crate::{serial::Serial, spec::Spec, vcpu_tasks::VcpuTaskController};

use super::{
state_driver::VmStartReason, BlockBackendMap, CrucibleBackendMap, DeviceMap,
};
use super::{BlockBackendMap, CrucibleBackendMap, DeviceMap};

/// A collection of components that make up a Propolis VM instance.
pub(crate) struct VmObjects {
Expand Down Expand Up @@ -189,6 +187,14 @@ impl VmObjectsLocked {
&self.ps2ctrl
}

pub(crate) fn device_map(&self) -> &DeviceMap {
&self.devices
}

pub(crate) fn block_backend_map(&self) -> &BlockBackendMap {
&self.block_backends
}

/// Iterates over all of the lifecycle trait objects in this VM and calls
/// `func` on each one.
pub(crate) fn for_each_device(
Expand Down Expand Up @@ -244,33 +250,6 @@ impl VmObjectsLocked {
self.machine.reinitialize().unwrap();
}

/// Starts a VM's devices and allows all of its vCPU tasks to run.
///
/// This function may be called either after initializing a new VM from
/// scratch or after an inbound live migration. In the latter case, this
/// routine assumes that the caller initialized and activated the VM's vCPUs
/// prior to importing state from the migration source.
pub(super) async fn start(
&mut self,
reason: VmStartReason,
) -> anyhow::Result<()> {
match reason {
VmStartReason::ExplicitRequest => {
self.reset_vcpus();
}
VmStartReason::MigratedIn => {
self.resume_kernel_vm();
}
}

let result = self.start_devices().await;
if result.is_ok() {
self.vcpu_tasks.resume_all();
}

result
}

/// Pauses this VM's devices and its kernel VMM.
pub(crate) async fn pause(&mut self) {
// Order matters here: the Propolis lifecycle trait's pause function
Expand All @@ -288,6 +267,16 @@ impl VmObjectsLocked {
// that all devices resume before any vCPUs do.
self.resume_kernel_vm();
self.resume_devices();
self.resume_vcpus();
}

/// Resumes this VM's vCPU tasks.
///
/// This is intended for use in VM startup sequences where the state driver
/// needs fine-grained control over the order in which devices and vCPUs
/// start. When pausing and resuming a VM that's already been started, use
/// [`Self::pause`] and [`Self::resume`] instead.
pub(crate) fn resume_vcpus(&mut self) {
self.vcpu_tasks.resume_all();
}

Expand Down Expand Up @@ -321,31 +310,7 @@ impl VmObjectsLocked {
// Resume devices so they're ready to do more work, then resume
// vCPUs.
self.resume_devices();
self.vcpu_tasks.resume_all();
}

/// Starts all of a VM's devices and allows its block backends to process
/// requests from their devices.
async fn start_devices(&self) -> anyhow::Result<()> {
self.for_each_device_fallible(|name, dev| {
info!(self.log, "sending startup complete to {}", name);
let res = dev.start();
if let Err(e) = &res {
error!(self.log, "startup failed for {}: {:?}", name, e);
}
res
})?;

for (name, backend) in self.block_backends.iter() {
info!(self.log, "starting block backend {}", name);
let res = backend.start().await;
if let Err(e) = &res {
error!(self.log, "startup failed for {}: {:?}", name, e);
return res;
}
}

Ok(())
self.resume_vcpus();
}

/// Pauses all of a VM's devices.
Expand Down
6 changes: 0 additions & 6 deletions bin/propolis-server/src/lib/vm/request_queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,6 @@ impl std::fmt::Debug for StateChangeRequest {
pub enum ComponentChangeRequest {
/// Attempts to update the volume construction request for the supplied
/// Crucible volume.
///
/// TODO: Due to https://github.com/oxidecomputer/crucible/issues/871, this
/// is only allowed once the VM is started and the volume has activated, but
/// it should be allowed even before the VM has started.
ReconfigureCrucibleVolume {
/// The ID of the Crucible backend in the VM's Crucible backend map.
backend_id: SpecKey,
Expand Down Expand Up @@ -356,7 +352,6 @@ impl ExternalRequestQueue {
"request" => ?request,
"reason" => %reason
);

return Err(reason);
}
}
Expand Down Expand Up @@ -795,7 +790,6 @@ mod test {
fn mutation_disallowed_after_stopped() {
let mut queue =
ExternalRequestQueue::new(test_logger(), InstanceAutoStart::Yes);

queue.notify_request_completed(CompletedRequest::Start {
succeeded: true,
});
Expand Down
Loading
Loading