Skip to content

Commit bd7a8cf

Browse files
authored
propolis-server should not crash when failing to start a VM (#866)
this approach fixes the immediate issue, but also makes sure that we have a clear way to do fallible init on the way to VmEnsureObjectsCreated. another approach here could be to move the call tree leading to VMM runtime creation and init into a separate thread, be it spawn_blocking or a scoped thread. this would start from `create_and_activate_vm()` to capture both the migration-based and local-request-based init. i'm just hesitant about moving more than strictly needed into "strange" scopes that might make future-us have to think twice about the execution environment.
1 parent 4f60515 commit bd7a8cf

File tree

1 file changed

+63
-29
lines changed

1 file changed

+63
-29
lines changed

bin/propolis-server/src/lib/vm/ensure.rs

+63-29
Original file line numberDiff line numberDiff line change
@@ -236,39 +236,67 @@ impl<'a> VmEnsureNotStarted<'a> {
236236
},
237237
));
238238

239-
// Create the runtime that will host tasks created by VMM components
240-
// (e.g. block device runtime tasks).
241-
let vmm_rt = tokio::runtime::Builder::new_multi_thread()
242-
.thread_name("tokio-rt-vmm")
243-
.worker_threads(usize::max(
244-
VMM_MIN_RT_THREADS,
245-
VMM_BASE_RT_THREADS + spec.board.cpus as usize,
246-
))
247-
.enable_all()
248-
.build()?;
249-
250239
let log_for_init = self.log.clone();
251240
let properties = self.ensure_request.properties.clone();
252241
let options = self.ensure_options.clone();
253242
let queue_for_init = input_queue.clone();
254-
let init_result = vmm_rt
255-
.spawn(async move {
256-
initialize_vm_objects(
257-
log_for_init,
258-
spec,
259-
properties,
260-
options,
261-
queue_for_init,
262-
)
263-
.await
264-
})
265-
.await
266-
.map_err(|e| {
267-
anyhow::anyhow!("failed to join VM object creation task: {e}")
268-
})?;
269-
270-
match init_result {
271-
Ok(objects) => {
243+
244+
// Either the block following this succeeds with both a Tokio runtime
245+
// and VM objects, or entirely fails with no partial state for us to
246+
// clean up.
247+
type InitResult =
248+
anyhow::Result<(tokio::runtime::Runtime, InputVmObjects)>;
249+
250+
// We need to create a new runtime to host the tasks for this VMM's
251+
// objects, but that initialization is fallible and results in dropping
252+
// the fledgling VMM runtime itself. Dropping a Tokio runtime on a
253+
// worker thread in a Tokio runtime will panic, so do all init in a
254+
// `spawn_blocking` where this won't be an issue.
255+
//
256+
// When the runtime is returned to this thread, it must not be dropped.
257+
// That means that the path between this result and returning an
258+
// `Ok(VmEnsureObjectsCreated)` must be infallible.
259+
let result: InitResult = tokio::task::spawn_blocking(move || {
260+
// Create the runtime that will host tasks created by
261+
// VMM components (e.g. block device runtime tasks).
262+
let vmm_rt = tokio::runtime::Builder::new_multi_thread()
263+
.thread_name("tokio-rt-vmm")
264+
.worker_threads(usize::max(
265+
VMM_MIN_RT_THREADS,
266+
VMM_BASE_RT_THREADS + spec.board.cpus as usize,
267+
))
268+
.enable_all()
269+
.build()?;
270+
271+
let init_result = vmm_rt
272+
.block_on(async move {
273+
initialize_vm_objects(
274+
log_for_init,
275+
spec,
276+
properties,
277+
options,
278+
queue_for_init,
279+
)
280+
.await
281+
})
282+
.map_err(|e| {
283+
anyhow::anyhow!(
284+
"failed to join VM object creation task: {e}"
285+
)
286+
})?;
287+
Ok((vmm_rt, init_result))
288+
})
289+
.await
290+
.map_err(|e| {
291+
// This is extremely unexpected: if the join failed, the init
292+
// task panicked or was cancelled. If the init itself failed,
293+
// which is somewhat more reasonable, we would expect the join
294+
// to succeed and have an error below.
295+
anyhow::anyhow!("failed to join VMM runtime init task: {e}")
296+
})?;
297+
298+
match result {
299+
Ok((vmm_rt, objects)) => {
272300
// N.B. Once these `VmObjects` exist, it is no longer safe to
273301
// call `vm_init_failed`.
274302
let objects = Arc::new(VmObjects::new(
@@ -310,6 +338,12 @@ impl<'a> VmEnsureNotStarted<'a> {
310338
/// Represents an instance ensure request that has proceeded far enough to
311339
/// create a set of VM objects, but that has not yet installed those objects as
312340
/// an `ActiveVm` or notified the requestor that its request is complete.
341+
///
342+
/// WARNING: dropping `VmEnsureObjectsCreated` is a panic risk since dropping
343+
/// the contained `tokio::runtime::Runtime` on in a worker thread will panic. It
344+
/// is probably a bug to drop `VmEnsureObjectsCreated`, as it is expected users
345+
/// will quickly call [`VmEnsureObjectsCreated::ensure_active`], but if you
346+
/// must, take care in handling the contained `vmm_rt`.
313347
pub(crate) struct VmEnsureObjectsCreated<'a> {
314348
log: &'a slog::Logger,
315349
vm: &'a Arc<super::Vm>,

0 commit comments

Comments
 (0)