Skip to content

Commit 0d89163

Browse files
committed
PHD: handle Ctrl-C more gracefully
Install a signal handler in the PHD runner that catches SIGINT and relays it via `tokio::sync::watch` to any tasks that are currently executing tests. This allows the test tasks to exit at the next available `await` point. Since these tasks are themselves awaited before the runner exits, this ensures that all VMs and disks created by a test can be dropped and cleaned up before the runner goes away. Ctrl-C characters sent to a shell will typically cause SIGINT to be sent to all processes in the foreground process group. This is bad for PHD because any Propolis servers the runner spawned will be killed before `TestVm::drop` gets a chance to gracefully stop the VMs inside. Since dropping a `PropolisServer` kills the process in any case, get around this by configuring PHD-spawned Propolis servers to ignore SIGINT. Tested with local Debian 11 runs, observing in particular that interrupting a run with Ctrl-C does not leak any VMMs or temporary disk objects.
1 parent c8f6937 commit 0d89163

File tree

2 files changed

+114
-26
lines changed

2 files changed

+114
-26
lines changed

phd-tests/framework/src/test_vm/server.rs

+46-12
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
//! Routines and data structures for working with Propolis server processes.
66
7-
use std::{fmt::Debug, net::SocketAddrV4};
7+
use std::{fmt::Debug, net::SocketAddrV4, os::unix::process::CommandExt};
88

99
use anyhow::Result;
1010
use camino::{Utf8Path, Utf8PathBuf};
@@ -62,18 +62,52 @@ impl PropolisServer {
6262
let (server_stdout, server_stderr) =
6363
log_mode.get_handles(&data_dir, vm_name)?;
6464

65+
// In the normal course of events, dropping a `TestVm` will spawn a task
66+
// that takes ownership of its `PropolisServer` and then sends Propolis
67+
// client calls to gracefully stop the VM running in that server (if
68+
// there is one). This ensures (modulo any bugs in Propolis itself) that
69+
// any Propolis servers hosting a running VM will have a chance to
70+
// destroy their kernel VMMs before the servers themselves go away.
71+
//
72+
// The PHD runner tries to give VMs a chance to tear down gracefully on
73+
// Ctrl-C by installing a custom SIGINT handler and using it to tell
74+
// running test tasks to shut down instead of completing their tests.
75+
// This allows the tests' VMs to be dropped gracefully. The trouble is
76+
// that with no additional intervention, pressing Ctrl-C to interrupt a
77+
// PHD runner process will typically cause SIGINTs to be delivered to
78+
// every process in the foreground process group, including both the
79+
// runner and all its Propolis server children. This breaks the `TestVm`
80+
// cleanup logic: by the time the VM is cleaned up, its server process
81+
// is already gone (due to the signal), so its VMM can't be cleaned up.
82+
//
83+
// To get around this, spawn Propolis server processes with a pre-`exec`
84+
// hook that directs them to ignore SIGINT. They will still be killed
85+
// during `TestVm` cleanup, assuming that executes normally; if the
86+
// runner is rudely terminated after that, the server process is still
87+
// preserved for investigation (and can still be cleaned up by other
88+
// means, e.g. SIGKILL).
89+
//
90+
// Note that it's undesirable for PHD to try to clean up leaked kernel
91+
// VMMs itself: leaking a kernel VMM after gracefully stopping a
92+
// Propolis server VM indicates a Propolis server bug.
6593
let server = PropolisServer {
66-
server: std::process::Command::new("pfexec")
67-
.args([
68-
server_path.as_str(),
69-
"run",
70-
config_toml_path.as_str(),
71-
server_addr.to_string().as_str(),
72-
vnc_addr.to_string().as_str(),
73-
])
74-
.stdout(server_stdout)
75-
.stderr(server_stderr)
76-
.spawn()?,
94+
server: unsafe {
95+
std::process::Command::new("pfexec")
96+
.args([
97+
server_path.as_str(),
98+
"run",
99+
config_toml_path.as_str(),
100+
server_addr.to_string().as_str(),
101+
vnc_addr.to_string().as_str(),
102+
])
103+
.stdout(server_stdout)
104+
.stderr(server_stderr)
105+
.pre_exec(move || {
106+
libc::signal(libc::SIGINT, libc::SIG_IGN);
107+
Ok(())
108+
})
109+
.spawn()?
110+
},
77111
address: server_addr,
78112
};
79113

phd-tests/runner/src/execute.rs

+68-14
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ use std::sync::Arc;
66
use std::time::{Duration, Instant};
77

88
use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome};
9-
use tracing::{error, info};
9+
use tokio::signal::unix::{signal, SignalKind};
10+
use tokio::sync::watch;
11+
use tracing::{error, info, warn};
1012

1113
use crate::config::RunOptions;
1214
use crate::fixtures::TestFixtures;
@@ -76,32 +78,52 @@ pub async fn run_tests_with_ctx(
7678
}
7779

7880
fixtures.execution_setup().unwrap();
79-
81+
let sigint_rx = set_sigint_handler();
8082
info!("Running {} test(s)", executions.len());
8183
let start_time = Instant::now();
82-
'exec_loop: for execution in &mut executions {
84+
for execution in &mut executions {
85+
if *sigint_rx.borrow() {
86+
info!("Test run interrupted by SIGINT");
87+
break;
88+
}
89+
8390
info!("Starting test {}", execution.tc.fully_qualified_name());
8491

8592
// Failure to run a setup fixture is fatal to the rest of the run, but
8693
// it's still possible to report results, so return gracefully instead
8794
// of panicking.
8895
if let Err(e) = fixtures.test_setup() {
8996
error!("Error running test setup fixture: {}", e);
90-
break 'exec_loop;
97+
break;
9198
}
9299

93100
stats.tests_not_run -= 1;
94101
let test_ctx = ctx.clone();
95102
let tc = execution.tc;
96-
let test_outcome =
97-
match tokio::spawn(async move { tc.run(test_ctx.as_ref()).await })
98-
.await
99-
{
100-
Ok(outcome) => outcome,
101-
Err(_) => TestOutcome::Failed(Some(
102-
"test task panicked, see test logs".to_string(),
103-
)),
104-
};
103+
let mut sigint_rx_task = sigint_rx.clone();
104+
let test_outcome = match tokio::spawn(async move {
105+
tokio::select! {
106+
biased;
107+
result = sigint_rx_task.changed() => {
108+
assert!(
109+
result.is_ok(),
110+
"SIGINT channel shouldn't drop while tests are running"
111+
);
112+
113+
TestOutcome::Failed(
114+
Some("test interrupted by SIGINT".to_string())
115+
)
116+
}
117+
outcome = tc.run(test_ctx.as_ref()) => outcome
118+
}
119+
})
120+
.await
121+
{
122+
Ok(outcome) => outcome,
123+
Err(_) => TestOutcome::Failed(Some(
124+
"test task panicked, see test logs".to_string(),
125+
)),
126+
};
105127

106128
info!(
107129
"test {} ... {}{}",
@@ -132,7 +154,7 @@ pub async fn run_tests_with_ctx(
132154
execution.status = Status::Ran(test_outcome);
133155
if let Err(e) = fixtures.test_cleanup().await {
134156
error!("Error running cleanup fixture: {}", e);
135-
break 'exec_loop;
157+
break;
136158
}
137159
}
138160
stats.duration = start_time.elapsed();
@@ -141,3 +163,35 @@ pub async fn run_tests_with_ctx(
141163

142164
stats
143165
}
166+
167+
/// Sets a global handler for SIGINT and hands the resulting signal channel over
168+
/// to a task that handles this signal. Returns a receiver to which the signal
169+
/// handler task publishes `true` to the channel when SIGINT is received.
170+
fn set_sigint_handler() -> watch::Receiver<bool> {
171+
let mut sigint =
172+
signal(SignalKind::interrupt()).expect("failed to set SIGINT handler");
173+
174+
let (sigint_tx, sigint_rx) = watch::channel(false);
175+
tokio::spawn(async move {
176+
loop {
177+
sigint.recv().await;
178+
179+
// If a signal was previously dispatched to the channel, exit
180+
// immediately with the customary SIGINT exit code (130 is 128 +
181+
// SIGINT). This allows users to interrupt tests even if they aren't
182+
// at an await point (at the cost of not having destructors run).
183+
if *sigint_tx.borrow() {
184+
error!(
185+
"SIGINT received while shutting down, rudely terminating"
186+
);
187+
error!("some processes and resources may have been leaked!");
188+
std::process::exit(130);
189+
}
190+
191+
warn!("SIGINT received, sending shutdown signal to tests");
192+
let _ = sigint_tx.send(true);
193+
}
194+
});
195+
196+
sigint_rx
197+
}

0 commit comments

Comments
 (0)