From ca877592081e0ae8dcd390a51817e5e8fa40fc85 Mon Sep 17 00:00:00 2001
From: Greg Colombo <greg@oxidecomputer.com>
Date: Wed, 31 Jan 2024 01:02:49 +0000
Subject: [PATCH 1/4] PHD: handle Ctrl-C more gracefully

Install a signal handler in the PHD runner that catches SIGINT and relays it via
`tokio::sync::watch` to any tasks that are currently executing tests. This
allows the test tasks to exit at the next available `await` point. Since these
tasks are themselves awaited before the runner exits, this ensures that all VMs
and disks created by a test can be dropped and cleaned up before the runner goes
away.

Ctrl-C characters sent to a shell will typically cause SIGINT to be sent to all
processes in the foreground process group. This is bad for PHD because any
Propolis servers the runner spawned will be killed before `TestVm::drop` gets a
chance to gracefully stop the VMs inside. Since dropping a `PropolisServer`
kills the process in any case, get around this by configuring PHD-spawned
Propolis servers to ignore SIGINT.

Tested with local Debian 11 runs, observing in particular that interrupting a
run with Ctrl-C does not leak any VMMs or temporary disk objects.
---
 phd-tests/framework/src/test_vm/server.rs | 58 ++++++++++++----
 phd-tests/runner/src/execute.rs           | 82 +++++++++++++++++++----
 2 files changed, 114 insertions(+), 26 deletions(-)

diff --git a/phd-tests/framework/src/test_vm/server.rs b/phd-tests/framework/src/test_vm/server.rs
index 039a44b03..1587926ad 100644
--- a/phd-tests/framework/src/test_vm/server.rs
+++ b/phd-tests/framework/src/test_vm/server.rs
@@ -4,7 +4,7 @@
 
 //! Routines and data structures for working with Propolis server processes.
 
-use std::{fmt::Debug, net::SocketAddrV4};
+use std::{fmt::Debug, net::SocketAddrV4, os::unix::process::CommandExt};
 
 use anyhow::Result;
 use camino::{Utf8Path, Utf8PathBuf};
@@ -62,18 +62,52 @@ impl PropolisServer {
         let (server_stdout, server_stderr) =
             log_mode.get_handles(&data_dir, vm_name)?;
 
+        // In the normal course of events, dropping a `TestVm` will spawn a task
+        // that takes ownership of its `PropolisServer` and then sends Propolis
+        // client calls to gracefully stop the VM running in that server (if
+        // there is one). This ensures (modulo any bugs in Propolis itself) that
+        // any Propolis servers hosting a running VM will have a chance to
+        // destroy their kernel VMMs before the servers themselves go away.
+        //
+        // The PHD runner tries to give VMs a chance to tear down gracefully on
+        // Ctrl-C by installing a custom SIGINT handler and using it to tell
+        // running test tasks to shut down instead of completing their tests.
+        // This allows the tests' VMs to be dropped gracefully. The trouble is
+        // that with no additional intervention, pressing Ctrl-C to interrupt a
+        // PHD runner process will typically cause SIGINTs to be delivered to
+        // every process in the foreground process group, including both the
+        // runner and all its Propolis server children. This breaks the `TestVm`
+        // cleanup logic: by the time the VM is cleaned up, its server process
+        // is already gone (due to the signal), so its VMM can't be cleaned up.
+        //
+        // To get around this, spawn Propolis server processes with a pre-`exec`
+        // hook that directs them to ignore SIGINT. They will still be killed
+        // during `TestVm` cleanup, assuming that executes normally; if the
+        // runner is rudely terminated after that, the server process is still
+        // preserved for investigation (and can still be cleaned up by other
+        // means, e.g. SIGKILL).
+        //
+        // Note that it's undesirable for PHD to try to clean up leaked kernel
+        // VMMs itself: leaking a kernel VMM after gracefully stopping a
+        // Propolis server VM indicates a Propolis server bug.
         let server = PropolisServer {
-            server: std::process::Command::new("pfexec")
-                .args([
-                    server_path.as_str(),
-                    "run",
-                    config_toml_path.as_str(),
-                    server_addr.to_string().as_str(),
-                    vnc_addr.to_string().as_str(),
-                ])
-                .stdout(server_stdout)
-                .stderr(server_stderr)
-                .spawn()?,
+            server: unsafe {
+                std::process::Command::new("pfexec")
+                    .args([
+                        server_path.as_str(),
+                        "run",
+                        config_toml_path.as_str(),
+                        server_addr.to_string().as_str(),
+                        vnc_addr.to_string().as_str(),
+                    ])
+                    .stdout(server_stdout)
+                    .stderr(server_stderr)
+                    .pre_exec(move || {
+                        libc::signal(libc::SIGINT, libc::SIG_IGN);
+                        Ok(())
+                    })
+                    .spawn()?
+            },
             address: server_addr,
         };
 
diff --git a/phd-tests/runner/src/execute.rs b/phd-tests/runner/src/execute.rs
index 91f57f280..c3d120ff7 100644
--- a/phd-tests/runner/src/execute.rs
+++ b/phd-tests/runner/src/execute.rs
@@ -6,7 +6,9 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};
 
 use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome};
-use tracing::{error, info};
+use tokio::signal::unix::{signal, SignalKind};
+use tokio::sync::watch;
+use tracing::{error, info, warn};
 
 use crate::config::RunOptions;
 use crate::fixtures::TestFixtures;
@@ -76,10 +78,15 @@ pub async fn run_tests_with_ctx(
     }
 
     fixtures.execution_setup().unwrap();
-
+    let sigint_rx = set_sigint_handler();
     info!("Running {} test(s)", executions.len());
     let start_time = Instant::now();
-    'exec_loop: for execution in &mut executions {
+    for execution in &mut executions {
+        if *sigint_rx.borrow() {
+            info!("Test run interrupted by SIGINT");
+            break;
+        }
+
         info!("Starting test {}", execution.tc.fully_qualified_name());
 
         // Failure to run a setup fixture is fatal to the rest of the run, but
@@ -87,21 +94,36 @@ pub async fn run_tests_with_ctx(
         // of panicking.
         if let Err(e) = fixtures.test_setup() {
             error!("Error running test setup fixture: {}", e);
-            break 'exec_loop;
+            break;
         }
 
         stats.tests_not_run -= 1;
         let test_ctx = ctx.clone();
         let tc = execution.tc;
-        let test_outcome =
-            match tokio::spawn(async move { tc.run(test_ctx.as_ref()).await })
-                .await
-            {
-                Ok(outcome) => outcome,
-                Err(_) => TestOutcome::Failed(Some(
-                    "test task panicked, see test logs".to_string(),
-                )),
-            };
+        let mut sigint_rx_task = sigint_rx.clone();
+        let test_outcome = match tokio::spawn(async move {
+            tokio::select! {
+                biased;
+                result = sigint_rx_task.changed() => {
+                    assert!(
+                        result.is_ok(),
+                        "SIGINT channel shouldn't drop while tests are running"
+                    );
+
+                    TestOutcome::Failed(
+                        Some("test interrupted by SIGINT".to_string())
+                    )
+                }
+                outcome = tc.run(test_ctx.as_ref()) => outcome
+            }
+        })
+        .await
+        {
+            Ok(outcome) => outcome,
+            Err(_) => TestOutcome::Failed(Some(
+                "test task panicked, see test logs".to_string(),
+            )),
+        };
 
         info!(
             "test {} ... {}{}",
@@ -132,7 +154,7 @@ pub async fn run_tests_with_ctx(
         execution.status = Status::Ran(test_outcome);
         if let Err(e) = fixtures.test_cleanup().await {
             error!("Error running cleanup fixture: {}", e);
-            break 'exec_loop;
+            break;
         }
     }
     stats.duration = start_time.elapsed();
@@ -141,3 +163,35 @@ pub async fn run_tests_with_ctx(
 
     stats
 }
+
+/// Sets a global handler for SIGINT and hands the resulting signal channel over
+/// to a task that handles this signal. Returns a receiver to which the signal
+/// handler task publishes `true` to the channel when SIGINT is received.
+fn set_sigint_handler() -> watch::Receiver<bool> {
+    let mut sigint =
+        signal(SignalKind::interrupt()).expect("failed to set SIGINT handler");
+
+    let (sigint_tx, sigint_rx) = watch::channel(false);
+    tokio::spawn(async move {
+        loop {
+            sigint.recv().await;
+
+            // If a signal was previously dispatched to the channel, exit
+            // immediately with the customary SIGINT exit code (130 is 128 +
+            // SIGINT). This allows users to interrupt tests even if they aren't
+            // at an await point (at the cost of not having destructors run).
+            if *sigint_tx.borrow() {
+                error!(
+                    "SIGINT received while shutting down, rudely terminating"
+                );
+                error!("some processes and resources may have been leaked!");
+                std::process::exit(130);
+            }
+
+            warn!("SIGINT received, sending shutdown signal to tests");
+            let _ = sigint_tx.send(true);
+        }
+    });
+
+    sigint_rx
+}

From acf692dad25a1672d3289077f6c3c5a968c3d67e Mon Sep 17 00:00:00 2001
From: Greg Colombo <greg@oxidecomputer.com>
Date: Wed, 31 Jan 2024 03:41:18 +0000
Subject: [PATCH 2/4] add comment

---
 phd-tests/runner/src/execute.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/phd-tests/runner/src/execute.rs b/phd-tests/runner/src/execute.rs
index c3d120ff7..930cab0d0 100644
--- a/phd-tests/runner/src/execute.rs
+++ b/phd-tests/runner/src/execute.rs
@@ -103,6 +103,8 @@ pub async fn run_tests_with_ctx(
         let mut sigint_rx_task = sigint_rx.clone();
         let test_outcome = match tokio::spawn(async move {
             tokio::select! {
+                // Ensure interrupt signals are always handled instead of
+                // continuing to run the test.
                 biased;
                 result = sigint_rx_task.changed() => {
                     assert!(

From c655dfbd2e8995f9ae7e9a17f50f9fe9ce781ec7 Mon Sep 17 00:00:00 2001
From: Greg Colombo <greg@oxidecomputer.com>
Date: Wed, 7 Feb 2024 19:36:29 +0000
Subject: [PATCH 3/4] clean up unneeded match

---
 phd-tests/runner/src/execute.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/phd-tests/runner/src/execute.rs b/phd-tests/runner/src/execute.rs
index 930cab0d0..3126e3a31 100644
--- a/phd-tests/runner/src/execute.rs
+++ b/phd-tests/runner/src/execute.rs
@@ -101,7 +101,7 @@ pub async fn run_tests_with_ctx(
         let test_ctx = ctx.clone();
         let tc = execution.tc;
         let mut sigint_rx_task = sigint_rx.clone();
-        let test_outcome = match tokio::spawn(async move {
+        let test_outcome = tokio::spawn(async move {
             tokio::select! {
                 // Ensure interrupt signals are always handled instead of
                 // continuing to run the test.
@@ -120,12 +120,11 @@ pub async fn run_tests_with_ctx(
             }
         })
         .await
-        {
-            Ok(outcome) => outcome,
-            Err(_) => TestOutcome::Failed(Some(
+        .unwrap_or_else(|_| {
+            TestOutcome::Failed(Some(
                 "test task panicked, see test logs".to_string(),
-            )),
-        };
+            ))
+        });
 
         info!(
             "test {} ... {}{}",

From 7f54723708fc45b10813b340d49f796a74772561 Mon Sep 17 00:00:00 2001
From: Greg Colombo <greg@oxidecomputer.com>
Date: Wed, 14 Feb 2024 20:01:03 +0000
Subject: [PATCH 4/4] reduce scope of unsafe

---
 phd-tests/framework/src/test_vm/server.rs | 79 +++++++++++------------
 1 file changed, 36 insertions(+), 43 deletions(-)

diff --git a/phd-tests/framework/src/test_vm/server.rs b/phd-tests/framework/src/test_vm/server.rs
index 1587926ad..0e6596c28 100644
--- a/phd-tests/framework/src/test_vm/server.rs
+++ b/phd-tests/framework/src/test_vm/server.rs
@@ -62,52 +62,45 @@ impl PropolisServer {
         let (server_stdout, server_stderr) =
             log_mode.get_handles(&data_dir, vm_name)?;
 
-        // In the normal course of events, dropping a `TestVm` will spawn a task
-        // that takes ownership of its `PropolisServer` and then sends Propolis
-        // client calls to gracefully stop the VM running in that server (if
-        // there is one). This ensures (modulo any bugs in Propolis itself) that
-        // any Propolis servers hosting a running VM will have a chance to
-        // destroy their kernel VMMs before the servers themselves go away.
+        let mut server_cmd = std::process::Command::new("pfexec");
+        server_cmd
+            .args([
+                server_path.as_str(),
+                "run",
+                config_toml_path.as_str(),
+                server_addr.to_string().as_str(),
+                vnc_addr.to_string().as_str(),
+            ])
+            .stdout(server_stdout)
+            .stderr(server_stderr);
+
+        // Gracefully shutting down a Propolis server requires PHD to send an
+        // instance stop request to the server before it is actually terminated.
+        // This ensures that the server has a chance to clean up kernel VMM
+        // resources. It's desirable for the server to do this and not PHD
+        // because a failure to clean up VMMs on stop is a Propolis bug.
         //
-        // The PHD runner tries to give VMs a chance to tear down gracefully on
-        // Ctrl-C by installing a custom SIGINT handler and using it to tell
-        // running test tasks to shut down instead of completing their tests.
-        // This allows the tests' VMs to be dropped gracefully. The trouble is
-        // that with no additional intervention, pressing Ctrl-C to interrupt a
-        // PHD runner process will typically cause SIGINTs to be delivered to
-        // every process in the foreground process group, including both the
-        // runner and all its Propolis server children. This breaks the `TestVm`
-        // cleanup logic: by the time the VM is cleaned up, its server process
-        // is already gone (due to the signal), so its VMM can't be cleaned up.
+        // The PHD runner sets up a SIGINT handler that tries to give the
+        // framework an opportunity to issue these requests before the runner
+        // exits. However, pressing Ctrl-C in a shell will typically broadcast
+        // SIGINT to all of the processes in the foreground process's group, not
+        // just to the foreground process itself. This means that a Ctrl-C press
+        // will usually kill all of PHD's Propolis servers before the cleanup
+        // logic can run.
         //
-        // To get around this, spawn Propolis server processes with a pre-`exec`
-        // hook that directs them to ignore SIGINT. They will still be killed
-        // during `TestVm` cleanup, assuming that executes normally; if the
-        // runner is rudely terminated after that, the server process is still
-        // preserved for investigation (and can still be cleaned up by other
-        // means, e.g. SIGKILL).
-        //
-        // Note that it's undesirable for PHD to try to clean up leaked kernel
-        // VMMs itself: leaking a kernel VMM after gracefully stopping a
-        // Propolis server VM indicates a Propolis server bug.
+        // To avoid this problem, add a pre-`exec` hook that directs Propolis
+        // servers to ignore SIGINT. On Ctrl-C, the runner will drop all active
+        // `TestVm`s, and this drop path (if allowed to complete) will kill all
+        // these processes.
+        unsafe {
+            server_cmd.pre_exec(move || {
+                libc::signal(libc::SIGINT, libc::SIG_IGN);
+                Ok(())
+            });
+        }
+
         let server = PropolisServer {
-            server: unsafe {
-                std::process::Command::new("pfexec")
-                    .args([
-                        server_path.as_str(),
-                        "run",
-                        config_toml_path.as_str(),
-                        server_addr.to_string().as_str(),
-                        vnc_addr.to_string().as_str(),
-                    ])
-                    .stdout(server_stdout)
-                    .stderr(server_stderr)
-                    .pre_exec(move || {
-                        libc::signal(libc::SIGINT, libc::SIG_IGN);
-                        Ok(())
-                    })
-                    .spawn()?
-            },
+            server: server_cmd.spawn()?,
             address: server_addr,
         };