Skip to content

Commit 5319622

Browse files
committed
review feedback, fix one more "idle" accounting test race
if there's a significant delay between the initial idle kstats observation becoming available and idle_start being measured, it's possible that the guest's actual idle vCPU time is greater than the "measured" idle time from PHD's view. instead, measure max idle time as "the time from before initial measurement to after last measurement", and min idle time as "the time just after initial measurement to after last measurement". still bias min idle time down a bit since the guest may have some active CPU time.
1 parent cea54ec commit 5319622

File tree

3 files changed

+56
-45
lines changed

3 files changed

+56
-45
lines changed

Cargo.lock

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

phd-tests/tests/Cargo.toml

+2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ doctest = false
99

1010
[dependencies]
1111
anyhow.workspace = true
12+
backoff.workspace = true
1213
byteorder.workspace = true
1314
chrono.workspace = true
1415
cpuid_utils.workspace = true
@@ -24,6 +25,7 @@ propolis-client.workspace = true
2425
reqwest.workspace = true
2526
slog-term.workspace = true
2627
slog.workspace = true
28+
strum.workspace = true
2729
tokio.workspace = true
2830
tracing.workspace = true
2931
uuid.workspace = true

phd-tests/tests/src/stats.rs

+52-45
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,11 @@
33
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
44

55
use std::collections::HashMap;
6+
use std::str::FromStr;
67
use std::sync::Arc;
78
use std::sync::Mutex;
89
use std::time::Duration;
910

10-
use phd_testcase::*;
11-
use tracing::trace;
12-
use uuid::Uuid;
13-
1411
use chrono::{DateTime, Utc};
1512
use dropshot::endpoint;
1613
use dropshot::ApiDescription;
@@ -26,8 +23,11 @@ use omicron_common::api::internal::nexus::ProducerKind;
2623
use omicron_common::api::internal::nexus::ProducerRegistrationResponse;
2724
use oximeter::types::{ProducerResults, ProducerResultsItem, Sample};
2825
use oximeter::{Datum, FieldValue};
26+
use phd_testcase::*;
2927
use slog::Drain;
3028
use slog::Logger;
29+
use tracing::trace;
30+
use uuid::Uuid;
3131

3232
fn test_logger() -> Logger {
3333
let dec = slog_term::PlainSyncDecorator::new(slog_term::TestStdoutWriter);
@@ -52,28 +52,15 @@ struct FakeNexusContext {
5252
sampler: Arc<Mutex<Option<PropolisOximeterSampler>>>,
5353
}
5454

55-
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
55+
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq, strum::EnumString)]
56+
#[strum(serialize_all = "snake_case")]
5657
enum VcpuState {
5758
Emulation,
5859
Run,
5960
Idle,
6061
Waiting,
6162
}
6263

63-
impl VcpuState {
64-
fn from_oximeter_state_name(name: &str) -> Self {
65-
match name {
66-
"emulation" => VcpuState::Emulation,
67-
"run" => VcpuState::Run,
68-
"idle" => VcpuState::Idle,
69-
"waiting" => VcpuState::Waiting,
70-
other => {
71-
panic!("unknown Oximeter vpcu state name: {}", other);
72-
}
73-
}
74-
}
75-
}
76-
7764
#[derive(Default)]
7865
struct VcpuUsageMetric {
7966
metrics: HashMap<VcpuState, u64>,
@@ -139,7 +126,7 @@ impl VirtualMachineMetrics {
139126
let amount = if let Datum::CumulativeU64(amount) = datum {
140127
amount.value()
141128
} else {
142-
panic!("unexpected reset value type");
129+
panic!("unexpected reset datum type: {:?}", datum);
143130
};
144131
self.reset = Some(amount);
145132
self.update_metric_times(last_sample.measurement.timestamp());
@@ -151,12 +138,15 @@ impl VirtualMachineMetrics {
151138
panic!("unexpected vcpu_usage datum type: {:?}", datum);
152139
};
153140
let field = &fields["state"];
154-
let state: VcpuState =
155-
if let FieldValue::String(state) = &field.value {
156-
VcpuState::from_oximeter_state_name(state.as_ref())
157-
} else {
158-
panic!("unknown vcpu state datum type: {:?}", field);
159-
};
141+
let state: VcpuState = if let FieldValue::String(state) =
142+
&field.value
143+
{
144+
VcpuState::from_str(state.as_ref()).unwrap_or_else(|_| {
145+
panic!("unknown Oximeter vpcu state name: {}", state);
146+
})
147+
} else {
148+
panic!("unknown vcpu state datum type: {:?}", field);
149+
};
160150
let field = &fields["vcpu_id"];
161151
let vcpu_id = if let FieldValue::U32(vcpu_id) = field.value {
162152
vcpu_id
@@ -203,33 +193,40 @@ impl FakeNexusContext {
203193
return;
204194
}
205195
}
206-
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
196+
tokio::time::sleep(Duration::from_millis(100)).await;
207197
}
208198
}
209199

210200
/// Sample Propolis' Oximeter metrics, waiting up to a few seconds so that
211201
/// all measurements are from the time this function was called or later.
212202
async fn wait_for_propolis_stats(&self) -> VirtualMachineMetrics {
213-
let retry_delay = Duration::from_millis(1000);
214-
let max_wait = Duration::from_millis(10000);
215-
let wait_start = std::time::SystemTime::now();
216-
217203
let min_metric_time = Utc::now();
218204

219-
while wait_start.elapsed().expect("time goes forward") < max_wait {
220-
if let Some(metrics) = self.sample_propolis_stats().await {
221-
if metrics.oldest_time >= min_metric_time {
222-
return metrics;
205+
let result = backoff::future::retry(
206+
backoff::ExponentialBackoff {
207+
max_interval: Duration::from_secs(1),
208+
max_elapsed_time: Some(Duration::from_secs(10)),
209+
..Default::default()
210+
},
211+
|| async {
212+
if let Some(metrics) = self.sample_propolis_stats().await {
213+
if metrics.oldest_time >= min_metric_time {
214+
Ok(metrics)
215+
} else {
216+
Err(backoff::Error::transient(anyhow::anyhow!(
217+
"sampled metrics are not recent enough"
218+
)))
219+
}
220+
} else {
221+
Err(backoff::Error::transient(anyhow::anyhow!(
222+
"full metrics sample not available (yet?)"
223+
)))
223224
}
224-
}
225+
},
226+
)
227+
.await;
225228

226-
tokio::time::sleep(retry_delay).await;
227-
}
228-
229-
panic!(
230-
"propolis-server Oximeter stats unavailable? waited {:?}",
231-
max_wait
232-
);
229+
result.expect("propolis-server Oximeter stats should become available")
233230
}
234231

235232
/// Sample Propolis' Oximeter metrics, including the timestamp of the oldest
@@ -415,6 +412,15 @@ async fn instance_vcpu_stats(ctx: &Framework) {
415412
// The guesswork to validate that doesn't seem great in the face of
416413
// variable-time CI. We'll validate idle time measurements separately,
417414
// below.
415+
416+
// Idle time boundaries are a little differnt than running time boundaries
417+
// because it's more difficult to stop counting to idle vCPU time than it is
418+
// to stop counting running vCPU time. Instead, the maximum amount of idling
419+
// time we might measure is however long it takes to get the initial kstat
420+
// readings, plus how long the idle time takes, plus however long it takes
421+
// to get final kstat readings. The miminum amount of idling time is
422+
// the time elapsed since just after the initial kstat readings.
423+
let max_idle_start = std::time::SystemTime::now();
418424
let idle_start_metrics =
419425
fake_nexus.app_private().wait_for_propolis_stats().await;
420426
let idle_start = std::time::SystemTime::now();
@@ -427,6 +433,7 @@ async fn instance_vcpu_stats(ctx: &Framework) {
427433
// could introduce as much as a full Oximeter sample interval of additional
428434
// idle vCPU, and is we why wait to measure idle time until *after* getting
429435
// new Oximeter metrics.
436+
let max_idle_time = max_idle_start.elapsed().expect("time goes forwards");
430437
let idle_time = idle_start.elapsed().expect("time goes forwards");
431438
trace!("measured idle time {:?}", idle_time);
432439

@@ -437,13 +444,13 @@ async fn instance_vcpu_stats(ctx: &Framework) {
437444
// We've idled for at least 20 seconds. The guest may not be fully idle (its
438445
// OS is still running on its sole CPU, for example), so we test that the
439446
// guest was just mostly idle for the time period.
440-
let min_guest_idle_delta = (idle_time.as_nanos() as f64 * 0.9) as u128;
441447
assert!(
442-
idle_delta < idle_time.as_nanos(),
448+
idle_delta < max_idle_time.as_nanos(),
443449
"{} < {}",
444450
idle_delta as f64 / NANOS_PER_SEC,
445451
idle_time.as_nanos() as f64 / NANOS_PER_SEC
446452
);
453+
let min_guest_idle_delta = (idle_time.as_nanos() as f64 * 0.9) as u128;
447454
assert!(
448455
idle_delta > min_guest_idle_delta,
449456
"{} > {}",

0 commit comments

Comments
 (0)