From a514531513148943e14d4a366c4fad439f30f452 Mon Sep 17 00:00:00 2001
From: iximeow
Date: Fri, 7 Mar 2025 16:35:18 +0000
Subject: [PATCH 1/5] parallel phd! test run in 40 seconds or your money back

---
 Cargo.lock                      |   1 +
 phd-tests/runner/Cargo.toml     |   1 +
 phd-tests/runner/src/config.rs  |   6 +
 phd-tests/runner/src/execute.rs | 202 ++++++++++++++++++--------------
 phd-tests/runner/src/main.rs    | 118 +++++++++++++++----
 5 files changed, 215 insertions(+), 113 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b2203c19d..b9c574859 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4001,6 +4001,7 @@ dependencies = [
  "camino",
  "cargo_metadata",
  "clap",
+ "crossbeam-channel",
  "phd-framework",
  "phd-tests",
  "tokio",
diff --git a/phd-tests/runner/Cargo.toml b/phd-tests/runner/Cargo.toml
index 57054e88f..c8847ec37 100644
--- a/phd-tests/runner/Cargo.toml
+++ b/phd-tests/runner/Cargo.toml
@@ -14,6 +14,7 @@ anyhow.workspace = true
 backtrace.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["derive"] }
+crossbeam-channel.workspace = true
 phd-framework.workspace = true
 phd-tests.workspace = true
 tokio = { workspace = true, features = ["full"] }
diff --git a/phd-tests/runner/src/config.rs b/phd-tests/runner/src/config.rs
index 1e152d08f..3edf351a4 100644
--- a/phd-tests/runner/src/config.rs
+++ b/phd-tests/runner/src/config.rs
@@ -156,6 +156,12 @@ pub struct RunOptions {
     #[clap(long, default_value = "file")]
     pub server_logging_mode: ServerLogMode,
 
+    /// The parallelism with which to run PHD tests. If not provided, phd-runner
+    /// will guess a reasonable number from the test environment's number of
+    /// CPUs and available memory.
+    #[clap(long, value_parser)]
+    pub parallelism: Option<u16>,
+
     /// The number of CPUs to assign to the guest in tests where the test is
     /// using the default machine configuration.
     #[clap(long, value_parser, default_value = "2")]
diff --git a/phd-tests/runner/src/execute.rs b/phd-tests/runner/src/execute.rs
index 3126e3a31..72d5d0e5e 100644
--- a/phd-tests/runner/src/execute.rs
+++ b/phd-tests/runner/src/execute.rs
@@ -2,7 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};
 
 use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome};
@@ -37,21 +37,9 @@ pub struct ExecutionStats {
     pub failed_test_cases: Vec<&'static TestCase>,
 }
 
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
-enum Status {
-    Ran(TestOutcome),
-    NotRun,
-}
-
-struct Execution {
-    tc: &'static TestCase,
-    status: Status,
-}
-
 /// Executes a set of tests using the supplied test context.
 pub async fn run_tests_with_ctx(
-    ctx: &Arc<Framework>,
-    mut fixtures: TestFixtures,
+    ctx: &mut Vec<(Arc<Framework>, TestFixtures)>,
     run_opts: &RunOptions,
 ) -> ExecutionStats {
     let mut executions = Vec::new();
@@ -60,10 +48,10 @@ pub async fn run_tests_with_ctx(
         &run_opts.include_filter,
         &run_opts.exclude_filter,
     ) {
-        executions.push(Execution { tc, status: Status::NotRun });
+        executions.push(tc);
     }
 
-    let mut stats = ExecutionStats {
+    let stats = ExecutionStats {
         tests_passed: 0,
         tests_failed: 0,
         tests_skipped: 0,
@@ -77,90 +65,128 @@ pub async fn run_tests_with_ctx(
         return stats;
     }
 
-    fixtures.execution_setup().unwrap();
-    let sigint_rx = set_sigint_handler();
-    info!("Running {} test(s)", executions.len());
-    let start_time = Instant::now();
-    for execution in &mut executions {
-        if *sigint_rx.borrow() {
-            info!("Test run interrupted by SIGINT");
-            break;
-        }
+    let stats = Arc::new(Mutex::new(stats));
 
-        info!("Starting test {}", execution.tc.fully_qualified_name());
+    async fn run_tests(
+        execution_rx: crossbeam_channel::Receiver<&'static TestCase>,
+        test_ctx: Arc<Framework>,
+        mut fixtures: TestFixtures,
+        stats: Arc<Mutex<ExecutionStats>>,
+        sigint_rx: watch::Receiver<bool>,
+    ) -> Result<(), ()> {
+        fixtures.execution_setup().unwrap();
 
-        // Failure to run a setup fixture is fatal to the rest of the run, but
-        // it's still possible to report results, so return gracefully instead
-        // of panicking.
-        if let Err(e) = fixtures.test_setup() {
-            error!("Error running test setup fixture: {}", e);
-            break;
-        }
+        loop {
+            // Check for SIGINT only at the top of the loop because while
+            // waiting for a new testcase is theoretically a blocking
+            // operation, it won't be in a meaningful way for our use. The
+            // recv() will return immediately because either there are more
+            // testcases to run or the sender is closed. The only long
+            // blocking operation to check against in this loop is the test
+            // run itself.
+            if *sigint_rx.borrow() {
+                info!("Test run interrupted by SIGINT");
+                break;
+            }
 
-        stats.tests_not_run -= 1;
-        let test_ctx = ctx.clone();
-        let tc = execution.tc;
-        let mut sigint_rx_task = sigint_rx.clone();
-        let test_outcome = tokio::spawn(async move {
-            tokio::select! {
-                // Ensure interrupt signals are always handled instead of
-                // continuing to run the test.
-                biased;
-                result = sigint_rx_task.changed() => {
-                    assert!(
-                        result.is_ok(),
-                        "SIGINT channel shouldn't drop while tests are running"
-                    );
-
-                    TestOutcome::Failed(
-                        Some("test interrupted by SIGINT".to_string())
-                    )
+            let tc = match execution_rx.recv() {
+                Ok(tc) => tc,
+                Err(_) => {
+                    // RecvError means the channel is closed, so we're all
+                    // done.
+                    break;
                 }
-                outcome = tc.run(test_ctx.as_ref()) => outcome
+            };
+
+            info!("Starting test {}", tc.fully_qualified_name());
+
+            // Failure to run a setup fixture is fatal to the rest of the
+            // run, but it's still possible to report results, so return
+            // gracefully instead of panicking.
+            if let Err(e) = fixtures.test_setup() {
+                error!("Error running test setup fixture: {}", e);
+                // TODO: set this on stats too
+                break;
             }
-        })
-        .await
-        .unwrap_or_else(|_| {
-            TestOutcome::Failed(Some(
-                "test task panicked, see test logs".to_string(),
-            ))
-        });
-
{}{}", - execution.tc.fully_qualified_name(), - match test_outcome { - TestOutcome::Passed => "ok", - TestOutcome::Failed(_) => "FAILED: ", - TestOutcome::Skipped(_) => "skipped: ", - }, - match &test_outcome { - TestOutcome::Failed(Some(s)) - | TestOutcome::Skipped(Some(s)) => s, - TestOutcome::Failed(None) | TestOutcome::Skipped(None) => - "[no message]", - _ => "", + + { + let mut stats = stats.lock().unwrap(); + stats.tests_not_run -= 1; } - ); - match test_outcome { - TestOutcome::Passed => stats.tests_passed += 1, - TestOutcome::Failed(_) => { - stats.tests_failed += 1; - stats.failed_test_cases.push(execution.tc); + let test_outcome = tc.run(test_ctx.as_ref()).await; + + info!( + "test {} ... {}{}", + tc.fully_qualified_name(), + match test_outcome { + TestOutcome::Passed => "ok", + TestOutcome::Failed(_) => "FAILED: ", + TestOutcome::Skipped(_) => "skipped: ", + }, + match &test_outcome { + TestOutcome::Failed(Some(s)) + | TestOutcome::Skipped(Some(s)) => s, + TestOutcome::Failed(None) | TestOutcome::Skipped(None) => + "[no message]", + _ => "", + } + ); + + { + let mut stats = stats.lock().unwrap(); + match test_outcome { + TestOutcome::Passed => stats.tests_passed += 1, + TestOutcome::Failed(_) => { + stats.tests_failed += 1; + stats.failed_test_cases.push(tc); + } + TestOutcome::Skipped(_) => stats.tests_skipped += 1, + } } - TestOutcome::Skipped(_) => stats.tests_skipped += 1, - } - execution.status = Status::Ran(test_outcome); - if let Err(e) = fixtures.test_cleanup().await { - error!("Error running cleanup fixture: {}", e); - break; + if let Err(e) = fixtures.test_cleanup().await { + error!("Error running cleanup fixture: {}", e); + // TODO: set this on stats + break; + } } + + fixtures.execution_cleanup().unwrap(); + + Ok(()) } - stats.duration = start_time.elapsed(); - fixtures.execution_cleanup().unwrap(); + let sigint_rx = set_sigint_handler(); + info!("Running {} test(s)", executions.len()); + let start_time = Instant::now(); + + let (execution_tx, execution_rx) = + crossbeam_channel::unbounded::<&'static TestCase>(); + + let mut test_runners = tokio::task::JoinSet::new(); + + for (ctx, fixtures) in ctx.drain(..) 
diff --git a/phd-tests/runner/src/main.rs b/phd-tests/runner/src/main.rs
index 225f5a45b..42894a00e 100644
--- a/phd-tests/runner/src/main.rs
+++ b/phd-tests/runner/src/main.rs
@@ -47,35 +47,103 @@ async fn main() -> anyhow::Result<()> {
 }
 
 async fn run_tests(run_opts: &RunOptions) -> anyhow::Result<ExitCode> {
-    let ctx_params = FrameworkParameters {
-        propolis_server_path: run_opts.propolis_server_cmd.clone(),
-        crucible_downstairs: run_opts.crucible_downstairs()?,
-        base_propolis: run_opts.base_propolis(),
-        tmp_directory: run_opts.tmp_directory.clone(),
-        artifact_directory: run_opts.artifact_directory(),
-        artifact_toml: run_opts.artifact_toml_path.clone(),
-        server_log_mode: run_opts.server_logging_mode,
-        default_guest_cpus: run_opts.default_guest_cpus,
-        default_guest_memory_mib: run_opts.default_guest_memory_mib,
-        default_guest_os_artifact: run_opts.default_guest_artifact.clone(),
-        default_bootrom_artifact: run_opts.default_bootrom_artifact.clone(),
-        port_range: 9000..10000,
-        max_buildomat_wait: Duration::from_secs(
-            run_opts.max_buildomat_wait_secs,
-        ),
-    };
-
-    let ctx = Arc::new(
-        Framework::new(ctx_params)
-            .await
-            .expect("should be able to set up a test context"),
-    );
+    let parallelism = run_opts.parallelism.unwrap_or_else(|| {
+        // Assume no test starts more than 4 VMs. This is a really conservative
+        // guess to make sure we don't cause tests to fail simply because we ran
+        // too many at once.
+        let cpus_per_runner = run_opts.default_guest_cpus as u64 * 4;
+        let memory_mib_per_runner = run_opts.default_guest_memory_mib * 4;
+
+        // I assume there's a smarter way to do this in illumos. sorry!!
+        let lgrpinfo = std::process::Command::new("/usr/bin/lgrpinfo")
+            .args(["-mc", "-u", "m"])
+            .output()
+            .expect("can run lgrpinfo");
+        assert_eq!(lgrpinfo.status.code(), Some(0));
+        let lgrpinfo_output_str =
+            String::from_utf8(lgrpinfo.stdout).expect("utf8 output");
+        let lines: Vec<&str> = lgrpinfo_output_str.split("\n").collect();
+        let cpuline = lines[1];
+        let cpu_range = cpuline
+            .strip_prefix("\tCPUS: ")
+            .expect("`CPUs` line starts as expected");
+        let mut cpu_range_parts = cpu_range.split("-");
+        let cpu_low = cpu_range_parts.next().expect("range has a low");
+        let cpu_high = cpu_range_parts.next().expect("range has a high");
+        let ncpus = cpu_high.parse::<u64>().expect("can parse cpu_high")
+            - cpu_low.parse::<u64>().expect("can parse cpu_low");
+
+        let lim_by_cpus = ncpus / cpus_per_runner;
+
+        let memoryline = lines[2];
+        let memory = memoryline
+            .strip_prefix("\tMemory: ")
+            .expect("`Memory` line starts as expected");
+        let installed = memory
+            .split(",")
+            .next()
+            .expect("memory line is comma-separated elements");
+        let installed_mb = installed
+            .strip_prefix("installed ")
+            .expect("memory line starts with installed")
+            .strip_suffix("M")
+            .expect("memory line ends with M")
+            .parse::<u64>()
+            .expect("can parse memory MB");
+
+        let lim_by_mem = installed_mb / memory_mib_per_runner;
+
+        std::cmp::min(lim_by_cpus as u16, lim_by_mem as u16)
+    });
+
+    // /!\ Arbitrary choice warning /!\
+    //
+    // We probably only need a half dozen ports at most for any test. 125 is an
+    // incredible overallocation to never have to worry about the problem, as
+    // long as we don't have hundreds of test runners running concurrently.
+    const PORT_RANGE_PER_RUNNER: u16 = 125;
+
+    let mut runners = Vec::new();
+
+    // Create up to parallelism collections of PHD framework settings. For the
+    // most part we can reuse the same settings for all test-running tasks -
+    // state is not mutably shared. The important exception is port ranges for
+    // servers PHD will run, where we want non-overlapping port ranges to ensure
+    // that concurrent tests don't accidentally use a neighbor's servers.
+    for i in 0..parallelism {
+        let port_range = (9000 + PORT_RANGE_PER_RUNNER * i)
+            ..(9000 + PORT_RANGE_PER_RUNNER * (i + 1));
+        let ctx_params = FrameworkParameters {
+            propolis_server_path: run_opts.propolis_server_cmd.clone(),
+            crucible_downstairs: run_opts.crucible_downstairs()?,
+            base_propolis: run_opts.base_propolis(),
+            tmp_directory: run_opts.tmp_directory.clone(),
+            artifact_directory: run_opts.artifact_directory(),
+            artifact_toml: run_opts.artifact_toml_path.clone(),
+            server_log_mode: run_opts.server_logging_mode,
+            default_guest_cpus: run_opts.default_guest_cpus,
+            default_guest_memory_mib: run_opts.default_guest_memory_mib,
+            default_guest_os_artifact: run_opts.default_guest_artifact.clone(),
+            default_bootrom_artifact: run_opts.default_bootrom_artifact.clone(),
+            port_range,
+            max_buildomat_wait: Duration::from_secs(
+                run_opts.max_buildomat_wait_secs,
+            ),
+        };
+
+        let ctx = Arc::new(
+            Framework::new(ctx_params)
+                .await
+                .expect("should be able to set up a test context"),
+        );
 
-    let fixtures = TestFixtures::new(ctx.clone()).unwrap();
+        let fixtures = TestFixtures::new(ctx.clone()).unwrap();
+        runners.push((ctx, fixtures));
+    }
 
     // Run the tests and print results.
     let execution_stats =
-        execute::run_tests_with_ctx(&ctx, fixtures, run_opts).await;
+        execute::run_tests_with_ctx(&mut runners, run_opts).await;
     if !execution_stats.failed_test_cases.is_empty() {
         println!("\nfailures:");
         for tc in &execution_stats.failed_test_cases {
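
Before the second patch: the non-overlapping port ranges handed to each
runner are simple arithmetic, but easier to see with concrete numbers. A
minimal sketch using the same base port and width as the patch:

    const BASE_PORT: u16 = 9000;
    const PORT_RANGE_PER_RUNNER: u16 = 125;

    fn port_range_for(runner: u16) -> std::ops::Range<u16> {
        let lo = BASE_PORT + PORT_RANGE_PER_RUNNER * runner;
        lo..(lo + PORT_RANGE_PER_RUNNER)
    }

    fn main() {
        // Runner 0 gets 9000..9125, runner 1 gets 9125..9250, and so on:
        // adjacent ranges share no ports, so concurrent runners' servers
        // can't collide.
        for i in 0..4 {
            println!("runner {i}: {:?}", port_range_for(i));
        }
    }
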
From f9d25ca845714ef11824c1e7533f2b5b1c614375 Mon Sep 17 00:00:00 2001
From: iximeow
Date: Sat, 8 Mar 2025 00:02:26 +0000
Subject: [PATCH 2/5] don't parse lgrpinfo for cpu/memory, be better about that

---
 Cargo.lock                   |   2 ++
 crates/bhyve-api/src/lib.rs  |   1 +
 phd-tests/runner/Cargo.toml  |   2 ++
 phd-tests/runner/src/main.rs | 156 ++++++++++++++++++++++++-----------
 4 files changed, 113 insertions(+), 48 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b9c574859..9031e5131 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3998,10 +3998,12 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "backtrace",
+ "bhyve_api 0.0.0",
  "camino",
  "cargo_metadata",
  "clap",
  "crossbeam-channel",
+ "libc",
  "phd-framework",
  "phd-tests",
  "tokio",
diff --git a/crates/bhyve-api/src/lib.rs b/crates/bhyve-api/src/lib.rs
index df4e32e28..e59af13fd 100644
--- a/crates/bhyve-api/src/lib.rs
+++ b/crates/bhyve-api/src/lib.rs
@@ -125,6 +125,7 @@ impl AsRawFd for VmmCtlFd {
     }
 }
 
+#[derive(Debug)]
 pub enum ReservoirError {
     /// Resizing operation was interrupted, but if a non-zero chunk size was
     /// specified, one or more chunk-sized adjustments to the reservoir size may
diff --git a/phd-tests/runner/Cargo.toml b/phd-tests/runner/Cargo.toml
index c8847ec37..a07024ed3 100644
--- a/phd-tests/runner/Cargo.toml
+++ b/phd-tests/runner/Cargo.toml
@@ -12,9 +12,11 @@ doctest = false
 [dependencies]
 anyhow.workspace = true
 backtrace.workspace = true
+bhyve_api.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["derive"] }
 crossbeam-channel.workspace = true
+libc.workspace = true
 phd-framework.workspace = true
 phd-tests.workspace = true
 tokio = { workspace = true, features = ["full"] }
diff --git a/phd-tests/runner/src/main.rs b/phd-tests/runner/src/main.rs
index 42894a00e..28828d575 100644
--- a/phd-tests/runner/src/main.rs
+++ b/phd-tests/runner/src/main.rs
@@ -6,6 +6,7 @@ mod config;
 mod execute;
 mod fixtures;
 
+use anyhow::{bail, Context};
 use clap::Parser;
 use config::{ListOptions, ProcessArgs, RunOptions};
 use phd_tests::phd_testcase::{Framework, FrameworkParameters};
@@ -46,55 +47,114 @@ async fn main() -> anyhow::Result<()> {
     Ok(())
 }
 
+fn guess_max_reasonable_parallelism(
+    default_guest_cpus: u8,
+    default_guest_memory_mib: u64,
+) -> anyhow::Result<u16> {
+    // Assume no test starts more than 3 VMs. This is a really conservative
+    // guess to make sure we don't cause tests to fail simply because we ran
+    // too many at once.
+    const MAX_VMS_GUESS: u64 = 3;
+    let cpus_per_runner = default_guest_cpus as u64 * MAX_VMS_GUESS;
+    let memory_mib_per_runner =
+        (default_guest_memory_mib * MAX_VMS_GUESS) as usize;
+
+    /// Minuscule wrapper for `sysconf(3C)` calls that checks errors.
+    fn sysconf(cfg: i32) -> std::io::Result<i64> {
+        // Safety: sysconf is an FFI call but we don't change any system
+        // state, it won't cause unwinding, etc.
+        let res = unsafe { libc::sysconf(cfg) };
+        // For the handful of variables that can be queried, the variable is
+        // defined and won't error. Technically if the variable is
+        // unsupported, `-1` is returned without changing `errno`. In such
+        // cases, returning errno might be misleading!
+        //
+        // Instead of trying to disambiguate this, and in the knowledge
+        // these calls as we make them should never fail, just fall back to
+        // a more general, always-factually-correct error message.
+        if res == -1 {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                format!("could not get sysconf({})", cfg),
+            ));
+        }
+        Ok(res)
+    }
+
+    let online_cpus = sysconf(libc::_SC_NPROCESSORS_ONLN)
+        .expect("can get number of online processors")
+        as u64;
+    // We're assuming that the system running tests is relatively idle other
+    // than the test runner itself. Overprovisioning CPUs will make everyone
+    // sad but should not fail tests, at least...
+    let lim_by_cpus = online_cpus / cpus_per_runner;
+
+    let ctl = bhyve_api::VmmCtlFd::open()?;
+    let reservoir =
+        ctl.reservoir_query().context("failed to query reservoir")?;
+    let mut vmm_mem_limit = reservoir.vrq_free_sz;
+
+    // The reservoir will be 0MiB by default if the system has not been
+    // configured with a particular size.
+    if reservoir.vrq_alloc_sz == 0 {
+        // If the reservoir is not configured, we'll try to make do with
+        // system memory and implore someone to earmark memory for test VMs in
+        // the future.
+        let page_size: usize = sysconf(libc::_SC_PAGESIZE)
+            .expect("can get page size")
+            .try_into()
+            .expect("page size is reasonable");
+        let total_pages: usize = sysconf(libc::_SC_PHYS_PAGES)
+            .expect("can get physical pages in the system")
+            .try_into()
+            .expect("physical page count is reasonable");
+
+        let installed_mb = page_size * total_pages;
+        // /!\ Arbitrary choice warning /!\
+        //
+        // It would be a little rude to spawn so many VMs that we cause the
+        // system running tests to empty the whole ARC and swap. If there's no
+        // reservoir, though, we're gonna use *some* amount of memory that isn't
+        // explicitly earmarked for bhyve. 1/4th is just a "feels ok"
+        // fraction.
+        vmm_mem_limit = installed_mb / 4;
+
+        eprintln!(
+            "phd-runner sees the VMM reservoir is unconfigured, and will use \
+            up to 25% of system memory ({}MiB) for test VMs. Please consider \
+            using `cargo run --bin rsrvrctl set <size in MiB>` to set aside \
+            memory for test VMs.",
+            vmm_mem_limit
+        );
+    }
+
+    let lim_by_mem = vmm_mem_limit / memory_mib_per_runner;
+
+    Ok(std::cmp::min(lim_by_cpus as u16, lim_by_mem as u16))
+}
+
 async fn run_tests(run_opts: &RunOptions) -> anyhow::Result<ExitCode> {
-    let parallelism = run_opts.parallelism.unwrap_or_else(|| {
-        // Assume no test starts more than 4 VMs. This is a really conservative
-        // guess to make sure we don't cause tests to fail simply because we ran
-        // too many at once.
-        let cpus_per_runner = run_opts.default_guest_cpus as u64 * 4;
-        let memory_mib_per_runner = run_opts.default_guest_memory_mib * 4;
-
-        // I assume there's a smarter way to do this in illumos. sorry!!
-        let lgrpinfo = std::process::Command::new("/usr/bin/lgrpinfo")
-            .args(["-mc", "-u", "m"])
-            .output()
-            .expect("can run lgrpinfo");
-        assert_eq!(lgrpinfo.status.code(), Some(0));
-        let lgrpinfo_output_str =
-            String::from_utf8(lgrpinfo.stdout).expect("utf8 output");
-        let lines: Vec<&str> = lgrpinfo_output_str.split("\n").collect();
-        let cpuline = lines[1];
-        let cpu_range = cpuline
-            .strip_prefix("\tCPUS: ")
-            .expect("`CPUs` line starts as expected");
-        let mut cpu_range_parts = cpu_range.split("-");
-        let cpu_low = cpu_range_parts.next().expect("range has a low");
-        let cpu_high = cpu_range_parts.next().expect("range has a high");
-        let ncpus = cpu_high.parse::<u64>().expect("can parse cpu_high")
-            - cpu_low.parse::<u64>().expect("can parse cpu_low");
-
-        let lim_by_cpus = ncpus / cpus_per_runner;
-
-        let memoryline = lines[2];
-        let memory = memoryline
-            .strip_prefix("\tMemory: ")
-            .expect("`Memory` line starts as expected");
-        let installed = memory
-            .split(",")
-            .next()
-            .expect("memory line is comma-separated elements");
-        let installed_mb = installed
-            .strip_prefix("installed ")
-            .expect("memory line starts with installed")
-            .strip_suffix("M")
-            .expect("memory line ends with M")
-            .parse::<u64>()
-            .expect("can parse memory MB");
-
-        let lim_by_mem = installed_mb / memory_mib_per_runner;
-
-        std::cmp::min(lim_by_cpus as u16, lim_by_mem as u16)
-    });
+    let parallelism = if let Some(parallelism) = run_opts.parallelism {
+        if parallelism == 0 {
+            bail!("Parallelism of 0 was requested; cannot run tests under these conditions!");
+        }
+        parallelism
+    } else {
+        let res = guess_max_reasonable_parallelism(
+            run_opts.default_guest_cpus,
+            run_opts.default_guest_memory_mib,
+        )?;
+        if res == 0 {
+            bail!(
+                "Inferred a parallelism of 0; this is probably because there \
+                is not much available memory for test VMs. Consider checking \
+                reservoir configuration."
+            );
+        }
+        res
+    };
+
+    info!("running tests with max parallelism of {}", parallelism);
 
     // /!\ Arbitrary choice warning /!\
     //
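
To make the guess concrete, here is a worked example with a hypothetical
host. Only the 2-vCPU guest default is visible in this series; the 512 MiB
guest memory figure below is an assumption for illustration:

    fn main() {
        // Hypothetical host: 32 online CPUs, 24576 MiB of usable VMM memory.
        const MAX_VMS_GUESS: u64 = 3;
        let (guest_cpus, guest_mem_mib) = (2u64, 512u64);

        let lim_by_cpus = 32 / (guest_cpus * MAX_VMS_GUESS); // 32 / 6 = 5
        let lim_by_mem = 24576 / (guest_mem_mib * MAX_VMS_GUESS); // 24576 / 1536 = 16

        // min(5, 16) = 5: this host is CPU-bound, so five runners.
        assert_eq!(std::cmp::min(lim_by_cpus, lim_by_mem), 5);
    }
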
From e0d4da0aa3d1e1786ea2854b5f2a73050b704e5c Mon Sep 17 00:00:00 2001
From: iximeow
Date: Sat, 8 Mar 2025 04:41:56 +0000
Subject: [PATCH 3/5] fix a units error, log more about parallelism
 determination

---
 phd-tests/runner/src/main.rs | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/phd-tests/runner/src/main.rs b/phd-tests/runner/src/main.rs
index 28828d575..02bcc943c 100644
--- a/phd-tests/runner/src/main.rs
+++ b/phd-tests/runner/src/main.rs
@@ -88,6 +88,10 @@ fn guess_max_reasonable_parallelism(
     // than the test runner itself. Overprovisioning CPUs will make everyone
     // sad but should not fail tests, at least...
     let lim_by_cpus = online_cpus / cpus_per_runner;
+    info!(
+        "parallelism by cpu count: {} ({} / {})",
+        lim_by_cpus, online_cpus, cpus_per_runner
+    );
 
     let ctl = bhyve_api::VmmCtlFd::open()?;
     let reservoir =
@@ -109,7 +113,9 @@ fn guess_max_reasonable_parallelism(
             .try_into()
             .expect("physical page count is reasonable");
 
-        let installed_mb = page_size * total_pages;
+        const MB: usize = 1024 * 1024;
+
+        let installed_mb = page_size * total_pages / MB;
         // /!\ Arbitrary choice warning /!\
         //
@@ -119,7 +125,7 @@ fn guess_max_reasonable_parallelism(
         // fraction.
         vmm_mem_limit = installed_mb / 4;
 
-        eprintln!(
+        warn!(
             "phd-runner sees the VMM reservoir is unconfigured, and will use \
             up to 25% of system memory ({}MiB) for test VMs. Please consider \
            using `cargo run --bin rsrvrctl set <size in MiB>` to set aside \
            memory for test VMs.",
            vmm_mem_limit
        );
    }
 
     let lim_by_mem = vmm_mem_limit / memory_mib_per_runner;
+    info!(
+        "parallelism by memory: {} ({} / {})",
+        lim_by_mem, vmm_mem_limit, memory_mib_per_runner
+    );
 
     Ok(std::cmp::min(lim_by_cpus as u16, lim_by_mem as u16))
 }
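
The units error patch 3 fixes is easy to state: `_SC_PHYS_PAGES` times
`_SC_PAGESIZE` yields bytes, while the limit math is in MiB, so the
undivided product overstates available memory by a factor of 2^20. A quick
sanity check with made-up numbers:

    fn main() {
        const MB: usize = 1024 * 1024;
        // A hypothetical 16 GiB machine with 4 KiB pages reports 4194304 pages.
        let (page_size, total_pages) = (4096usize, 4_194_304usize);
        let installed_mb = page_size * total_pages / MB;
        assert_eq!(installed_mb, 16 * 1024); // 16384 MiB, not ~17.2 billion
    }
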
From 589a5ea03218eae5a59b44b95dc1aafe222d59be Mon Sep 17 00:00:00 2001
From: iximeow
Date: Mon, 10 Mar 2025 19:35:55 +0000
Subject: [PATCH 4/5] this will probably work, but is rancid

if phd-runner decides to run with parallelism > 1, the tmpdir and
artifact directories, which phd-runner will write to from each
framework, are rewritten to be unique per-framework. this means the
directory layout changes based on available parallelism, and makes the
`phd-run-with-args.sh` script somewhat more gross, and generally is
pretty unfortunate.

at this point it's just to demonstrate that something is possible, but
this really deserves the requisite surgery to share one log dir and one
artifact dir across the Frameworks.
---
 .github/buildomat/phd-run-with-args.sh | 14 ++++-
 Cargo.lock                             |  1 +
 crates/cpuid-utils/Cargo.toml          |  1 +
 crates/cpuid-utils/src/host.rs         |  4 +-
 phd-tests/runner/src/main.rs           | 78 +++++++++++++++++++++++---
 5 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/.github/buildomat/phd-run-with-args.sh b/.github/buildomat/phd-run-with-args.sh
index d7b6bb463..373e02446 100755
--- a/.github/buildomat/phd-run-with-args.sh
+++ b/.github/buildomat/phd-run-with-args.sh
@@ -64,8 +64,20 @@ set +e
 failcount=$?
 set -e
 
+# Disable errexit again because we may try collecting logs from runners that ran
+# no tests (in which case *.log doesn't expand and tar will fail to find a file
+# by that literal name)
+set +e
 tar -czvf /tmp/phd-tmp-files.tar.gz \
-    -C /tmp/propolis-phd /tmp/propolis-phd/*.log
+    -C $tmpdir $tmpdir/*.log
+for runnerdir in $tmpdir/runner-*; do
+    if [ -d "$runnerdir" ]; then
+        tar -rzvf /tmp/phd-tmp-files.tar.gz \
+            -C $runnerdir $runnerdir/*.log
+    fi
+done
+set -e
+
 exitcode=0
 if [ $failcount -eq 0 ]; then
diff --git a/Cargo.lock b/Cargo.lock
index 9031e5131..17183cc9f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -777,6 +777,7 @@ dependencies = [
  "propolis_types",
  "proptest",
  "thiserror 1.0.64",
+ "uuid",
 ]
 
 [[package]]
diff --git a/crates/cpuid-utils/Cargo.toml b/crates/cpuid-utils/Cargo.toml
index 096eb5bbc..1563ba5df 100644
--- a/crates/cpuid-utils/Cargo.toml
+++ b/crates/cpuid-utils/Cargo.toml
@@ -10,6 +10,7 @@ bhyve_api.workspace = true
 propolis_api_types = {workspace = true, optional = true}
 propolis_types.workspace = true
 thiserror.workspace = true
+uuid.workspace = true
 
 [dev-dependencies]
 proptest.workspace = true
diff --git a/crates/cpuid-utils/src/host.rs b/crates/cpuid-utils/src/host.rs
index 539c03b3d..65ca95087 100644
--- a/crates/cpuid-utils/src/host.rs
+++ b/crates/cpuid-utils/src/host.rs
@@ -5,6 +5,7 @@
 use bhyve_api::{VmmCtlFd, VmmFd};
 use propolis_types::{CpuidIdent, CpuidValues, CpuidVendor};
 use thiserror::Error;
+use uuid::Uuid;
 
 use crate::{
     bits::{
@@ -32,7 +33,8 @@ struct Vm(bhyve_api::VmmFd);
 
 impl Vm {
     fn new() -> Result {
-        let name = format!("cpuid-gen-{}", std::process::id());
+        let name =
+            format!("cpuid-gen-{}-{}", std::process::id(), Uuid::new_v4());
         let mut req = bhyve_api::vm_create_req::new(name.as_bytes())
             .expect("valid VM name");
diff --git a/phd-tests/runner/src/main.rs b/phd-tests/runner/src/main.rs
index 02bcc943c..e7590fa5d 100644
--- a/phd-tests/runner/src/main.rs
+++ b/phd-tests/runner/src/main.rs
@@ -175,14 +175,9 @@ async fn run_tests(run_opts: &RunOptions) -> anyhow::Result<ExitCode> {
 
     let mut runners = Vec::new();
 
-    // Create up to parallelism collections of PHD framework settings. For the
-    // most part we can reuse the same settings for all test-running tasks -
-    // state is not mutably shared. The important exception is port ranges for
-    // servers PHD will run, where we want non-overlapping port ranges to ensure
-    // that concurrent tests don't accidentally use a neighbor's servers.
-    for i in 0..parallelism {
-        let port_range = (9000 + PORT_RANGE_PER_RUNNER * i)
-            ..(9000 + PORT_RANGE_PER_RUNNER * (i + 1));
+    if parallelism == 1 {
+        // If there's only one test runner, the provided config is by definition
+        // not going to have issues with parallelism. Consume it directly.
         let ctx_params = FrameworkParameters {
             propolis_server_path: run_opts.propolis_server_cmd.clone(),
             crucible_downstairs: run_opts.crucible_downstairs()?,
             base_propolis: run_opts.base_propolis(),
@@ -195,7 +190,7 @@ async fn run_tests(run_opts: &RunOptions) -> anyhow::Result<ExitCode> {
             default_guest_memory_mib: run_opts.default_guest_memory_mib,
             default_guest_os_artifact: run_opts.default_guest_artifact.clone(),
             default_bootrom_artifact: run_opts.default_bootrom_artifact.clone(),
-            port_range,
+            port_range: 9000..(9000 + PORT_RANGE_PER_RUNNER),
             max_buildomat_wait: Duration::from_secs(
                 run_opts.max_buildomat_wait_secs,
             ),
@@ -209,6 +204,71 @@ async fn run_tests(run_opts: &RunOptions) -> anyhow::Result<ExitCode> {
 
         let fixtures = TestFixtures::new(ctx.clone()).unwrap();
         runners.push((ctx, fixtures));
+    } else {
+        // Create up to parallelism collections of PHD framework settings. Many
+        // settings can be reused directly, but some describe state the
+        // framework will mutate (causing ports to be allocated, fetching
+        // artifacts, etc). Take the provided config and make it safe for
+        // Frameworks to use independently.
+        //
+        // This implies downloading artifacts {parallelism} times, extracting
+        // artifacts redundantly for the different artifact directories, and
+        // makes terrible use of the port range. It'd be much better to accept
+        // some owner of the shared state that each Framework can take as a
+        // parameter.
+
+        // We'll separate directories by which Framework they're for in some
+        // cases, below. Those uniquifying names will be padded to all be the
+        // length of the longest uniquifier to help make it kind of legible.
+        let pad_size = format!("{}", parallelism - 1).len();
+
+        for i in 0..parallelism {
+            let port_range = (9000 + PORT_RANGE_PER_RUNNER * i)
+                ..(9000 + PORT_RANGE_PER_RUNNER * (i + 1));
+            let mut unshared_tmp_dir = run_opts.tmp_directory.clone();
+            unshared_tmp_dir.push(format!("runner-{:pad_size$}", i));
+
+            let mut unshared_artifacts_dir =
+                run_opts.artifact_directory().clone();
+            unshared_artifacts_dir.push(format!("runner-{:pad_size$}", i));
+
+            // The runner directories may well not exist, we just invented them
+            // after all. Create them so phd-runner can use them. This is awful.
+            // Sorry.
+            let _ = std::fs::create_dir(&unshared_tmp_dir);
+            let _ = std::fs::create_dir(&unshared_artifacts_dir);
+
+            let ctx_params = FrameworkParameters {
+                propolis_server_path: run_opts.propolis_server_cmd.clone(),
+                crucible_downstairs: run_opts.crucible_downstairs()?,
+                base_propolis: run_opts.base_propolis(),
+                tmp_directory: unshared_tmp_dir,
+                artifact_directory: unshared_artifacts_dir,
+                artifact_toml: run_opts.artifact_toml_path.clone(),
+                server_log_mode: run_opts.server_logging_mode,
+                default_guest_cpus: run_opts.default_guest_cpus,
+                default_guest_memory_mib: run_opts.default_guest_memory_mib,
+                default_guest_os_artifact: run_opts
+                    .default_guest_artifact
+                    .clone(),
+                default_bootrom_artifact: run_opts
+                    .default_bootrom_artifact
+                    .clone(),
+                port_range,
+                max_buildomat_wait: Duration::from_secs(
+                    run_opts.max_buildomat_wait_secs,
+                ),
+            };
+
+            let ctx = Arc::new(
+                Framework::new(ctx_params)
+                    .await
+                    .expect("should be able to set up a test context"),
+            );
+
+            let fixtures = TestFixtures::new(ctx.clone()).unwrap();
+            runners.push((ctx, fixtures));
+        }
     }
 
     // Run the tests and print results.
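
One note on the uniquifier above: Rust's bare width specifier pads numeric
arguments with spaces rather than zeros, so low-numbered runner names contain
a space. A small sketch of what the format actually produces (a parallelism
of 12 is hypothetical here):

    fn main() {
        let parallelism: u16 = 12;
        let pad_size = format!("{}", parallelism - 1).len();
        // Width alone right-aligns with space fill; zero-padded names would
        // need "runner-{:0pad_size$}" instead.
        assert_eq!(format!("runner-{:pad_size$}", 0), "runner- 0");
        assert_eq!(format!("runner-{:pad_size$}", 11), "runner-11");
    }
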
From 5cc9a75b65c00b34ff85b2ced63591ed29bff8ec Mon Sep 17 00:00:00 2001
From: iximeow
Date: Mon, 10 Mar 2025 20:06:51 +0000
Subject: [PATCH 5/5] clamp parallelism so at least CI passes good grief

---
 .github/buildomat/phd-run-with-args.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/buildomat/phd-run-with-args.sh b/.github/buildomat/phd-run-with-args.sh
index 373e02446..bc23bc85a 100755
--- a/.github/buildomat/phd-run-with-args.sh
+++ b/.github/buildomat/phd-run-with-args.sh
@@ -54,6 +54,7 @@ args=(
     '--artifact-toml-path' $artifacts
     '--tmp-directory' $tmpdir
     '--artifact-directory' $artifactdir
+    '--parallelism' 2
     $@
 )