3
3
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4
4
5
5
use std:: collections:: HashMap ;
6
+ use std:: str:: FromStr ;
6
7
use std:: sync:: Arc ;
7
8
use std:: sync:: Mutex ;
8
9
use std:: time:: Duration ;
9
10
10
- use phd_testcase:: * ;
11
- use tracing:: trace;
12
- use uuid:: Uuid ;
13
-
14
11
use chrono:: { DateTime , Utc } ;
15
12
use dropshot:: endpoint;
16
13
use dropshot:: ApiDescription ;
@@ -26,8 +23,11 @@ use omicron_common::api::internal::nexus::ProducerKind;
26
23
use omicron_common:: api:: internal:: nexus:: ProducerRegistrationResponse ;
27
24
use oximeter:: types:: { ProducerResults , ProducerResultsItem , Sample } ;
28
25
use oximeter:: { Datum , FieldValue } ;
26
+ use phd_testcase:: * ;
29
27
use slog:: Drain ;
30
28
use slog:: Logger ;
29
+ use tracing:: trace;
30
+ use uuid:: Uuid ;
31
31
32
32
fn test_logger ( ) -> Logger {
33
33
let dec = slog_term:: PlainSyncDecorator :: new ( slog_term:: TestStdoutWriter ) ;
@@ -52,28 +52,15 @@ struct FakeNexusContext {
52
52
sampler : Arc < Mutex < Option < PropolisOximeterSampler > > > ,
53
53
}
54
54
55
- #[ derive( Copy , Clone , Debug , Eq , Hash , PartialEq ) ]
55
+ #[ derive( Copy , Clone , Debug , Eq , Hash , PartialEq , strum:: EnumString ) ]
56
+ #[ strum( serialize_all = "snake_case" ) ]
56
57
enum VcpuState {
57
58
Emulation ,
58
59
Run ,
59
60
Idle ,
60
61
Waiting ,
61
62
}
62
63
63
- impl VcpuState {
64
- fn from_oximeter_state_name ( name : & str ) -> Self {
65
- match name {
66
- "emulation" => VcpuState :: Emulation ,
67
- "run" => VcpuState :: Run ,
68
- "idle" => VcpuState :: Idle ,
69
- "waiting" => VcpuState :: Waiting ,
70
- other => {
71
- panic ! ( "unknown Oximeter vpcu state name: {}" , other) ;
72
- }
73
- }
74
- }
75
- }
76
-
77
64
#[ derive( Default ) ]
78
65
struct VcpuUsageMetric {
79
66
metrics : HashMap < VcpuState , u64 > ,
@@ -139,7 +126,7 @@ impl VirtualMachineMetrics {
139
126
let amount = if let Datum :: CumulativeU64 ( amount) = datum {
140
127
amount. value ( )
141
128
} else {
142
- panic ! ( "unexpected reset value type" ) ;
129
+ panic ! ( "unexpected reset datum type: {:?}" , datum ) ;
143
130
} ;
144
131
self . reset = Some ( amount) ;
145
132
self . update_metric_times ( last_sample. measurement . timestamp ( ) ) ;
@@ -151,12 +138,15 @@ impl VirtualMachineMetrics {
151
138
panic ! ( "unexpected vcpu_usage datum type: {:?}" , datum) ;
152
139
} ;
153
140
let field = & fields[ "state" ] ;
154
- let state: VcpuState =
155
- if let FieldValue :: String ( state) = & field. value {
156
- VcpuState :: from_oximeter_state_name ( state. as_ref ( ) )
157
- } else {
158
- panic ! ( "unknown vcpu state datum type: {:?}" , field) ;
159
- } ;
141
+ let state: VcpuState = if let FieldValue :: String ( state) =
142
+ & field. value
143
+ {
144
+ VcpuState :: from_str ( state. as_ref ( ) ) . unwrap_or_else ( |_| {
145
+ panic ! ( "unknown Oximeter vpcu state name: {}" , state) ;
146
+ } )
147
+ } else {
148
+ panic ! ( "unknown vcpu state datum type: {:?}" , field) ;
149
+ } ;
160
150
let field = & fields[ "vcpu_id" ] ;
161
151
let vcpu_id = if let FieldValue :: U32 ( vcpu_id) = field. value {
162
152
vcpu_id
@@ -203,33 +193,40 @@ impl FakeNexusContext {
203
193
return ;
204
194
}
205
195
}
206
- tokio:: time:: sleep ( std :: time :: Duration :: from_millis ( 100 ) ) . await ;
196
+ tokio:: time:: sleep ( Duration :: from_millis ( 100 ) ) . await ;
207
197
}
208
198
}
209
199
210
200
/// Sample Propolis' Oximeter metrics, waiting up to a few seconds so that
211
201
/// all measurements are from the time this function was called or later.
212
202
async fn wait_for_propolis_stats ( & self ) -> VirtualMachineMetrics {
213
- let retry_delay = Duration :: from_millis ( 1000 ) ;
214
- let max_wait = Duration :: from_millis ( 10000 ) ;
215
- let wait_start = std:: time:: SystemTime :: now ( ) ;
216
-
217
203
let min_metric_time = Utc :: now ( ) ;
218
204
219
- while wait_start. elapsed ( ) . expect ( "time goes forward" ) < max_wait {
220
- if let Some ( metrics) = self . sample_propolis_stats ( ) . await {
221
- if metrics. oldest_time >= min_metric_time {
222
- return metrics;
205
+ let result = backoff:: future:: retry (
206
+ backoff:: ExponentialBackoff {
207
+ max_interval : Duration :: from_secs ( 1 ) ,
208
+ max_elapsed_time : Some ( Duration :: from_secs ( 10 ) ) ,
209
+ ..Default :: default ( )
210
+ } ,
211
+ || async {
212
+ if let Some ( metrics) = self . sample_propolis_stats ( ) . await {
213
+ if metrics. oldest_time >= min_metric_time {
214
+ Ok ( metrics)
215
+ } else {
216
+ Err ( backoff:: Error :: transient ( anyhow:: anyhow!(
217
+ "sampled metrics are not recent enough"
218
+ ) ) )
219
+ }
220
+ } else {
221
+ Err ( backoff:: Error :: transient ( anyhow:: anyhow!(
222
+ "full metrics sample not available (yet?)"
223
+ ) ) )
223
224
}
224
- }
225
+ } ,
226
+ )
227
+ . await ;
225
228
226
- tokio:: time:: sleep ( retry_delay) . await ;
227
- }
228
-
229
- panic ! (
230
- "propolis-server Oximeter stats unavailable? waited {:?}" ,
231
- max_wait
232
- ) ;
229
+ result. expect ( "propolis-server Oximeter stats should become available" )
233
230
}
234
231
235
232
/// Sample Propolis' Oximeter metrics, including the timestamp of the oldest
@@ -415,6 +412,15 @@ async fn instance_vcpu_stats(ctx: &Framework) {
415
412
// The guesswork to validate that doesn't seem great in the face of
416
413
// variable-time CI. We'll validate idle time measurements separately,
417
414
// below.
415
+
416
+ // Idle time boundaries are a little different than running time boundaries
417
+ // because it's more difficult to stop counting idle vCPU time than it is
418
+ // to stop counting running vCPU time. Instead, the maximum amount of idling
419
+ // time we might measure is however long it takes to get the initial kstat
420
+ // readings, plus how long the idle time takes, plus however long it takes
421
+ // to get final kstat readings. The minimum amount of idling time is
422
+ // the time elapsed since just after the initial kstat readings.
423
+ let max_idle_start = std:: time:: SystemTime :: now ( ) ;
418
424
let idle_start_metrics =
419
425
fake_nexus. app_private ( ) . wait_for_propolis_stats ( ) . await ;
420
426
let idle_start = std:: time:: SystemTime :: now ( ) ;
@@ -427,6 +433,7 @@ async fn instance_vcpu_stats(ctx: &Framework) {
427
433
// could introduce as much as a full Oximeter sample interval of additional
428
434
// idle vCPU, and is why we wait to measure idle time until *after* getting
429
435
// new Oximeter metrics.
436
+ let max_idle_time = max_idle_start. elapsed ( ) . expect ( "time goes forwards" ) ;
430
437
let idle_time = idle_start. elapsed ( ) . expect ( "time goes forwards" ) ;
431
438
trace ! ( "measured idle time {:?}" , idle_time) ;
432
439
@@ -437,13 +444,13 @@ async fn instance_vcpu_stats(ctx: &Framework) {
437
444
// We've idled for at least 20 seconds. The guest may not be fully idle (its
438
445
// OS is still running on its sole CPU, for example), so we test that the
439
446
// guest was just mostly idle for the time period.
440
- let min_guest_idle_delta = ( idle_time. as_nanos ( ) as f64 * 0.9 ) as u128 ;
441
447
assert ! (
442
- idle_delta < idle_time . as_nanos( ) ,
448
+ idle_delta < max_idle_time . as_nanos( ) ,
443
449
"{} < {}" ,
444
450
idle_delta as f64 / NANOS_PER_SEC ,
445
451
idle_time. as_nanos( ) as f64 / NANOS_PER_SEC
446
452
) ;
453
+ let min_guest_idle_delta = ( idle_time. as_nanos ( ) as f64 * 0.9 ) as u128 ;
447
454
assert ! (
448
455
idle_delta > min_guest_idle_delta,
449
456
"{} > {}" ,
0 commit comments