From 5a1abced9062c8c5c68fb83d06f688a9f4ef637d Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 4 Mar 2025 19:35:56 +0000 Subject: [PATCH 1/7] Use production propolis-server flags in PHD CI builds We use the dev (e.g. non-release) build of propolis-server in PHD to get debug assertions, which we want, but we should use the `omicron-build` feature to be in line with production binaries. `omicron-build` had also acquired code related to failure injection, which we *also* want in CI so we can test migration failure codepaths. That is moved to a new `failure-injection` feature flag with a bit of ceremony to stop us early if we're ever inadvertently packaging a propolis-server with `omicron-build` and `failure-injection`. Fixes #412. --- .github/buildomat/jobs/phd-build.sh | 5 +++- bin/propolis-server/Cargo.toml | 6 +++++ bin/propolis-server/src/lib/initializer.rs | 2 +- .../src/lib/migrate/preamble.rs | 4 ++-- .../src/lib/spec/api_spec_v0.rs | 12 +++++----- bin/propolis-server/src/lib/spec/builder.rs | 6 ++--- bin/propolis-server/src/lib/spec/mod.rs | 6 ++--- bin/propolis-server/src/lib/vm/ensure.rs | 4 +--- bin/propolis-server/src/main.rs | 13 +++++++++++ .../src/instance_spec/components/devices.rs | 4 ++-- lib/propolis/Cargo.toml | 2 ++ xtask/src/task_phd.rs | 23 +++++++++++++++---- 12 files changed, 61 insertions(+), 26 deletions(-) diff --git a/.github/buildomat/jobs/phd-build.sh b/.github/buildomat/jobs/phd-build.sh index 5caaacdc8..782e522c4 100644 --- a/.github/buildomat/jobs/phd-build.sh +++ b/.github/buildomat/jobs/phd-build.sh @@ -30,7 +30,10 @@ rustc --version # Build the Propolis server binary with 'dev' profile to enable assertions that # should fire during tests. banner build-propolis -ptime -m cargo build --verbose -p propolis-server + +export PHD_BUILD="true" +ptime -m cargo build --verbose -p propolis-server \ + --features omicron-build,failure-injection # The PHD runner requires unwind-on-panic to catch certain test failures, so # build it with the 'dev' profile which is so configured. diff --git a/bin/propolis-server/Cargo.toml b/bin/propolis-server/Cargo.toml index c748d4dfb..a3a07da86 100644 --- a/bin/propolis-server/Cargo.toml +++ b/bin/propolis-server/Cargo.toml @@ -83,3 +83,9 @@ omicron-build = ["propolis/omicron-build"] # Falcon builds require corresponding bits turned on in the dependency libs falcon = ["propolis/falcon"] + +# Testing necessitates injecting failures which should hopefully be rare or even +# never occur on real otherwise-unperturbed systems. We conditionally compile +# code supporting failure injection to avoid the risk of somehow injecting +# failures into a real system not under test. +failure-injection = ["propolis/failure-injection"] diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index a4ea795cb..7be62c8e1 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -814,7 +814,7 @@ impl MachineInitializer<'_> { Ok(()) } - #[cfg(not(feature = "omicron-build"))] + #[cfg(feature = "failure-injection")] pub fn initialize_test_devices(&mut self) { use propolis::hw::testdev::{ MigrationFailureDevice, MigrationFailures, diff --git a/bin/propolis-server/src/lib/migrate/preamble.rs b/bin/propolis-server/src/lib/migrate/preamble.rs index 4c6af125a..0201665d2 100644 --- a/bin/propolis-server/src/lib/migrate/preamble.rs +++ b/bin/propolis-server/src/lib/migrate/preamble.rs @@ -52,7 +52,7 @@ impl Preamble { }; match comp { - #[cfg(feature = "omicron-build")] + #[cfg(not(feature = "failure-injection"))] ReplacementComponent::MigrationFailureInjector(_) => { return Err(MigrateError::InstanceSpecsIncompatible( format!( @@ -62,7 +62,7 @@ impl Preamble { )); } - #[cfg(not(feature = "omicron-build"))] + #[cfg(feature = "failure-injection")] ReplacementComponent::MigrationFailureInjector(comp) => { let ComponentV0::MigrationFailureInjector(src) = to_amend else { diff --git a/bin/propolis-server/src/lib/spec/api_spec_v0.rs b/bin/propolis-server/src/lib/spec/api_spec_v0.rs index d97f982c5..18011c97c 100644 --- a/bin/propolis-server/src/lib/spec/api_spec_v0.rs +++ b/bin/propolis-server/src/lib/spec/api_spec_v0.rs @@ -27,7 +27,7 @@ use super::{ StorageDevice, }; -#[cfg(not(feature = "omicron-build"))] +#[cfg(feature = "failure-injection")] use super::MigrationFailure; #[cfg(feature = "falcon")] @@ -65,7 +65,7 @@ impl From for InstanceSpecV0 { serial, pci_pci_bridges, pvpanic, - #[cfg(not(feature = "omicron-build"))] + #[cfg(feature = "failure-injection")] migration_failure, #[cfg(feature = "falcon")] softnpu, @@ -157,7 +157,7 @@ impl From for InstanceSpecV0 { ); } - #[cfg(not(feature = "omicron-build"))] + #[cfg(feature = "failure-injection")] if let Some(mig) = migration_failure { insert_component( &mut spec, @@ -310,14 +310,14 @@ impl TryFrom for Spec { // apply it to the builder later. boot_settings = Some((device_id, settings)); } - #[cfg(feature = "omicron-build")] + #[cfg(not(feature = "failure-injection"))] ComponentV0::MigrationFailureInjector(_) => { return Err(ApiSpecError::FeatureCompiledOut { component: device_id, - feature: "omicron-build", + feature: "failure-injection", }); } - #[cfg(not(feature = "omicron-build"))] + #[cfg(feature = "failure-injection")] ComponentV0::MigrationFailureInjector(mig) => { builder.add_migration_failure_device(MigrationFailure { id: device_id, diff --git a/bin/propolis-server/src/lib/spec/builder.rs b/bin/propolis-server/src/lib/spec/builder.rs index c212790bd..ddd5c5e5b 100644 --- a/bin/propolis-server/src/lib/spec/builder.rs +++ b/bin/propolis-server/src/lib/spec/builder.rs @@ -26,7 +26,7 @@ use super::{ Board, BootOrderEntry, BootSettings, Disk, Nic, QemuPvpanic, SerialPort, }; -#[cfg(not(feature = "omicron-build"))] +#[cfg(feature = "failure-injection")] use super::MigrationFailure; #[cfg(feature = "falcon")] @@ -50,7 +50,7 @@ pub(crate) enum SpecBuilderError { #[error("pvpanic device already specified")] PvpanicInUse, - #[cfg(not(feature = "omicron-build"))] + #[cfg(feature = "failure-injection")] #[error("migration failure injection already enabled")] MigrationFailureInjectionInUse, @@ -269,7 +269,7 @@ impl SpecBuilder { Ok(self) } - #[cfg(not(feature = "omicron-build"))] + #[cfg(feature = "failure-injection")] pub fn add_migration_failure_device( &mut self, mig: MigrationFailure, diff --git a/bin/propolis-server/src/lib/spec/mod.rs b/bin/propolis-server/src/lib/spec/mod.rs index c6da82e67..5bf7f8836 100644 --- a/bin/propolis-server/src/lib/spec/mod.rs +++ b/bin/propolis-server/src/lib/spec/mod.rs @@ -34,7 +34,7 @@ use propolis_api_types::instance_spec::{ }; use thiserror::Error; -#[cfg(not(feature = "omicron-build"))] +#[cfg(feature = "failure-injection")] use propolis_api_types::instance_spec::components::devices::MigrationFailureInjector; #[cfg(feature = "falcon")] @@ -73,7 +73,7 @@ pub(crate) struct Spec { pub pci_pci_bridges: BTreeMap, pub pvpanic: Option, - #[cfg(not(feature = "omicron-build"))] + #[cfg(feature = "failure-injection")] pub migration_failure: Option, #[cfg(feature = "falcon")] @@ -285,7 +285,7 @@ pub struct QemuPvpanic { pub spec: QemuPvpanicDesc, } -#[cfg(not(feature = "omicron-build"))] +#[cfg(feature = "failure-injection")] #[derive(Clone, Debug)] pub struct MigrationFailure { pub id: SpecKey, diff --git a/bin/propolis-server/src/lib/vm/ensure.rs b/bin/propolis-server/src/lib/vm/ensure.rs index bef254f30..4ee2d1f23 100644 --- a/bin/propolis-server/src/lib/vm/ensure.rs +++ b/bin/propolis-server/src/lib/vm/ensure.rs @@ -564,10 +564,8 @@ async fn initialize_vm_objects( ))?; init.initialize_network_devices(&chipset).await?; - #[cfg(not(feature = "omicron-build"))] + #[cfg(feature = "failure-injection")] init.initialize_test_devices(); - #[cfg(feature = "omicron-build")] - info!(log, "`omicron-build` feature enabled, ignoring any test devices"); #[cfg(feature = "falcon")] { diff --git a/bin/propolis-server/src/main.rs b/bin/propolis-server/src/main.rs index a45718405..6e56186b1 100644 --- a/bin/propolis-server/src/main.rs +++ b/bin/propolis-server/src/main.rs @@ -278,6 +278,19 @@ fn main() -> anyhow::Result<()> { // Ensure proper setup of USDT probes register_probes().unwrap(); + #[cfg(all( + feature = "omicron-build", + any(feature = "failure-injection", feature = "falcon") + ))] + if option_env!("PHD_BUILD") != Some("true") { + panic!( + "`omicron-build` is enabled alongside development features, \ + this build is NOT SUITABLE for production. Set PHD_BUILD=true in \ + the environment and rebuild propolis-server if you really need \ + this to work." + ); + } + // Command line arguments. let args = Args::parse(); diff --git a/crates/propolis-api-types/src/instance_spec/components/devices.rs b/crates/propolis-api-types/src/instance_spec/components/devices.rs index 9dc86d031..7bb2d9fc5 100644 --- a/crates/propolis-api-types/src/instance_spec/components/devices.rs +++ b/crates/propolis-api-types/src/instance_spec/components/devices.rs @@ -193,8 +193,8 @@ pub struct P9fs { /// Describes a synthetic device that registers for VM lifecycle notifications /// and returns errors during attempts to migrate. /// -/// This is only supported by Propolis servers compiled without the -/// `omicron-build` feature. +/// This is only supported by Propolis servers compiled with the +/// `failure-injection` feature. #[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)] #[serde(deny_unknown_fields)] pub struct MigrationFailureInjector { diff --git a/lib/propolis/Cargo.toml b/lib/propolis/Cargo.toml index dc403b885..e8e687fb8 100644 --- a/lib/propolis/Cargo.toml +++ b/lib/propolis/Cargo.toml @@ -61,3 +61,5 @@ falcon = ["libloading", "p9ds", "dlpi", "ispf", "rand", "softnpu", "viona_api/fa # TODO until crucible#1280 is addressed, enabling Nexus notifications is done # through a feature flag. omicron-build = ["crucible/notify-nexus"] + +failure-injection = [] diff --git a/xtask/src/task_phd.rs b/xtask/src/task_phd.rs index 4cb6eaedd..8185295c1 100644 --- a/xtask/src/task_phd.rs +++ b/xtask/src/task_phd.rs @@ -4,7 +4,7 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; -use std::{fs, process::Command, time}; +use std::{collections::HashMap, fs, process::Command, time}; macro_rules! cargo_log { ($tag:literal, $($arg:tt)*) => { @@ -270,7 +270,7 @@ impl Cmd { return Ok(()); } Self::List { phd_args } => { - let phd_runner = build_bin("phd-runner", false)?; + let phd_runner = build_bin("phd-runner", false, None)?; let status = run_exit_code( phd_runner.command().arg("list").args(phd_args), )?; @@ -278,7 +278,7 @@ impl Cmd { } Self::RunnerHelp { phd_args } => { - let phd_runner = build_bin("phd-runner", false)?; + let phd_runner = build_bin("phd-runner", false, None)?; let status = run_exit_code( phd_runner.command().arg("help").args(phd_args), )?; @@ -292,7 +292,14 @@ impl Cmd { cmd } None => { - let bin = build_bin("propolis-server", propolis_args.release)?; + let mut server_build_env = HashMap::new(); + server_build_env + .insert("PHD_BUILD".to_string(), "true".to_string()); + let bin = build_bin( + "propolis-server", + propolis_args.release, + Some(server_build_env), + )?; let path = bin .path() .try_into() @@ -345,7 +352,7 @@ impl Cmd { anyhow::bail!("Missing artifacts config `{artifacts_toml}`!"); } - let phd_runner = build_bin("phd-runner", false)?; + let phd_runner = build_bin("phd-runner", false, None)?; let mut cmd = if cfg!(target_os = "illumos") { let mut cmd = Command::new("pfexec"); cmd.arg(phd_runner.path()); @@ -421,12 +428,18 @@ impl BasePropolisArgs { fn build_bin( name: impl AsRef, release: bool, + build_env: Option>, ) -> anyhow::Result { let name = name.as_ref(); cargo_log!("Compiling", "{name}"); let mut cmd = escargot::CargoBuild::new().package(name).bin(name).current_target(); + if let Some(env) = build_env { + for (k, v) in env { + cmd = cmd.env(k, v); + } + } let profile = if release { cmd = cmd.release(); "release [optimized]" From 9ad3276c604bd23f1cd1cd504bfa3df2cc77eb03 Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 4 Mar 2025 21:05:44 +0000 Subject: [PATCH 2/7] ofc that's in the OpenAPI doc as well --- openapi/propolis-server.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openapi/propolis-server.json b/openapi/propolis-server.json index 09b3a590d..030abd887 100644 --- a/openapi/propolis-server.json +++ b/openapi/propolis-server.json @@ -1526,7 +1526,7 @@ ] }, "MigrationFailureInjector": { - "description": "Describes a synthetic device that registers for VM lifecycle notifications and returns errors during attempts to migrate.\n\nThis is only supported by Propolis servers compiled without the `omicron-build` feature.", + "description": "Describes a synthetic device that registers for VM lifecycle notifications and returns errors during attempts to migrate.\n\nThis is only supported by Propolis servers compiled with the `failure-injection` feature.", "type": "object", "properties": { "fail_exports": { From cf02de70e02d59ecb8fbf48fc8537c42fd6a6755 Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 4 Mar 2025 21:28:22 +0000 Subject: [PATCH 3/7] lib/propolis does not need failure-injection, phd xtask does --- bin/propolis-server/Cargo.toml | 2 +- lib/propolis/Cargo.toml | 2 -- xtask/src/task_phd.rs | 21 ++++++++++++++++++--- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/bin/propolis-server/Cargo.toml b/bin/propolis-server/Cargo.toml index a3a07da86..98add8228 100644 --- a/bin/propolis-server/Cargo.toml +++ b/bin/propolis-server/Cargo.toml @@ -88,4 +88,4 @@ falcon = ["propolis/falcon"] # never occur on real otherwise-unperturbed systems. We conditionally compile # code supporting failure injection to avoid the risk of somehow injecting # failures into a real system not under test. -failure-injection = ["propolis/failure-injection"] +failure-injection = [] diff --git a/lib/propolis/Cargo.toml b/lib/propolis/Cargo.toml index e8e687fb8..dc403b885 100644 --- a/lib/propolis/Cargo.toml +++ b/lib/propolis/Cargo.toml @@ -61,5 +61,3 @@ falcon = ["libloading", "p9ds", "dlpi", "ispf", "rand", "softnpu", "viona_api/fa # TODO until crucible#1280 is addressed, enabling Nexus notifications is done # through a feature flag. omicron-build = ["crucible/notify-nexus"] - -failure-injection = [] diff --git a/xtask/src/task_phd.rs b/xtask/src/task_phd.rs index 8185295c1..8eb7f6cd8 100644 --- a/xtask/src/task_phd.rs +++ b/xtask/src/task_phd.rs @@ -270,7 +270,7 @@ impl Cmd { return Ok(()); } Self::List { phd_args } => { - let phd_runner = build_bin("phd-runner", false, None)?; + let phd_runner = build_bin("phd-runner", false, None, None)?; let status = run_exit_code( phd_runner.command().arg("list").args(phd_args), )?; @@ -278,7 +278,7 @@ impl Cmd { } Self::RunnerHelp { phd_args } => { - let phd_runner = build_bin("phd-runner", false, None)?; + let phd_runner = build_bin("phd-runner", false, None, None)?; let status = run_exit_code( phd_runner.command().arg("help").args(phd_args), )?; @@ -298,6 +298,12 @@ impl Cmd { let bin = build_bin( "propolis-server", propolis_args.release, + // Some PHD tests specifically cover cases where a component + // in the system has encountered an error, so enable + // failure-injection. Do not enable `omicron-build` like we + // do in Buildomat because someone running `cargo xtask phd` + // is not building a propolis-server destined for Omicron. + Some("failure-injection"), Some(server_build_env), )?; let path = bin @@ -352,7 +358,7 @@ impl Cmd { anyhow::bail!("Missing artifacts config `{artifacts_toml}`!"); } - let phd_runner = build_bin("phd-runner", false, None)?; + let phd_runner = build_bin("phd-runner", false, None, None)?; let mut cmd = if cfg!(target_os = "illumos") { let mut cmd = Command::new("pfexec"); cmd.arg(phd_runner.path()); @@ -425,9 +431,15 @@ impl BasePropolisArgs { } } +/// Build the binary `name` in debug or release with an optional build +/// environment variables and a list of Cargo features. +/// +/// `features` is passed directly to Cargo, and so must be a space or +/// comma-separated list of features to activate. fn build_bin( name: impl AsRef, release: bool, + features: Option<&str>, build_env: Option>, ) -> anyhow::Result { let name = name.as_ref(); @@ -435,6 +447,9 @@ fn build_bin( let mut cmd = escargot::CargoBuild::new().package(name).bin(name).current_target(); + if let Some(features) = features { + cmd = cmd.features(features); + } if let Some(env) = build_env { for (k, v) in env { cmd = cmd.env(k, v); From 23e42cf0e7899b843c508a0c2c969dc490cf8607 Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 4 Mar 2025 23:55:23 +0000 Subject: [PATCH 4/7] configure a reservoir before doing phd in CI, now that we'll use it --- .github/buildomat/phd-run-with-args.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/buildomat/phd-run-with-args.sh b/.github/buildomat/phd-run-with-args.sh index d7b6bb463..506a35264 100755 --- a/.github/buildomat/phd-run-with-args.sh +++ b/.github/buildomat/phd-run-with-args.sh @@ -37,6 +37,18 @@ if [ ! -d "$tmpdir" ]; then mkdir $tmpdir fi +# We'll be using the reservoir, so set it to something higher than the default +# 0MiB. Most tests would only need 512MiB (the default PHD guest VM memory +# size), some tests want to run multiple VMs concurrently (particularly around +# migration). 4096MiB is an arbitrary number intended to support the above and +# that we might want to run those tests concurrently at some point. +# +# Currently the lab host these tests will run on is well-known and has much +# more memory than this. Hopefully we won't have Propolis CI running on a +# machine with ~4GiB of memory, but this number could be tuned down if the need +# arises. +/usr/lib/rsrvrctl -s 4096 + banner 'Tests' runner="$phddir/phd-runner" From b8729a97d7dfa077d0132cf3222521440cf765e5 Mon Sep 17 00:00:00 2001 From: iximeow Date: Wed, 5 Mar 2025 00:33:47 +0000 Subject: [PATCH 5/7] right, CI is not root --- .github/buildomat/phd-run-with-args.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/buildomat/phd-run-with-args.sh b/.github/buildomat/phd-run-with-args.sh index 506a35264..4ca389227 100755 --- a/.github/buildomat/phd-run-with-args.sh +++ b/.github/buildomat/phd-run-with-args.sh @@ -47,7 +47,7 @@ fi # more memory than this. Hopefully we won't have Propolis CI running on a # machine with ~4GiB of memory, but this number could be tuned down if the need # arises. -/usr/lib/rsrvrctl -s 4096 +pfexec /usr/lib/rsrvrctl -s 4096 banner 'Tests' From ebdee7334385694b6180a00c67102a54e83a5d3f Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 6 Mar 2025 17:23:33 -0800 Subject: [PATCH 6/7] describe the new "PHD_BUILD" variable in phd-build.sh Co-authored-by: Greg Colombo --- .github/buildomat/jobs/phd-build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/buildomat/jobs/phd-build.sh b/.github/buildomat/jobs/phd-build.sh index 782e522c4..2131c21fb 100644 --- a/.github/buildomat/jobs/phd-build.sh +++ b/.github/buildomat/jobs/phd-build.sh @@ -31,6 +31,8 @@ rustc --version # should fire during tests. banner build-propolis +# Compile propolis-server so that it allows development features to be used even +# though the `omicron-build` feature is enabled. export PHD_BUILD="true" ptime -m cargo build --verbose -p propolis-server \ --features omicron-build,failure-injection From 8affc6504ee4fc6663dd72e2339724d41b44c969 Mon Sep 17 00:00:00 2001 From: iximeow Date: Fri, 7 Mar 2025 03:11:59 +0000 Subject: [PATCH 7/7] add an --omicron-build flag to cargo xtask phd also --- xtask/src/task_phd.rs | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/xtask/src/task_phd.rs b/xtask/src/task_phd.rs index 8eb7f6cd8..8c47f972c 100644 --- a/xtask/src/task_phd.rs +++ b/xtask/src/task_phd.rs @@ -112,6 +112,16 @@ struct PropolisArgs { /// If set, build `propolis-server` in release mode. #[clap(long, short = 'r')] release: bool, + + /// If set, build `propolis-server` with `omicron-build`. This may enable + /// codepaths that only work on `stlouis`, or otherwise expect specific test + /// environment configuration (such as having a reservoir large enough to + /// allocate VMs). + /// + /// This can be helpful if you're debugging an issue that occurs in CI, + /// where `propolis-server` is also tested with `omicron-build`. + #[clap(long)] + omicron_build: bool, } #[derive(Debug, Clone, clap::Parser)] @@ -295,15 +305,22 @@ impl Cmd { let mut server_build_env = HashMap::new(); server_build_env .insert("PHD_BUILD".to_string(), "true".to_string()); + + // Some PHD tests specifically cover cases where a component in + // the system has encountered an error, so enable + // failure-injection. Do not enable `omicron-build` by default + // because it configures `propolis-server` to expect an Omicron- + // or stlouis-specific environment. + let mut propolis_features = vec!["failure-injection"]; + if propolis_args.omicron_build { + // If you know your environment looks like we'd expect in + // Omicron, have at it! + propolis_features.push("omicron-build"); + } let bin = build_bin( "propolis-server", propolis_args.release, - // Some PHD tests specifically cover cases where a component - // in the system has encountered an error, so enable - // failure-injection. Do not enable `omicron-build` like we - // do in Buildomat because someone running `cargo xtask phd` - // is not building a propolis-server destined for Omicron. - Some("failure-injection"), + Some(&propolis_features.join(",")), Some(server_build_env), )?; let path = bin