Skip to content

Commit 11a4cf8

Browse files
authored
Remove the Active → Faulted transition (#1260)
This PR removes the Active → Faulted transition when too many IO operations are in flight. See #1252 for details on why this is desirable! There are a bunch of changes that come along for the ride: - The backpressure equation is changed from quadratic to the form `1 / (1 - f)`, i.e. it now goes to infinity at our desired maximum value. These maximum values are set at 2x the fault for the Offline → Faulted transition. - Computing backpressure is now a standalone function on `BackpressureConfig`, so it can be unit tested - Unit tests are added to confirm backpressure properties - The `Offline` → `Faulted` transition happens at a reasonable timescale (> 1 sec, < 2 minutes) - Backpressure goes to ∞ as we approach these limits - Backpressure settings are tweaked based on these new requirements
1 parent 2e80400 commit 11a4cf8

File tree

10 files changed

+319
-183
lines changed

10 files changed

+319
-183
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ hex = "0.4"
5353
http = "0.2.12"
5454
httptest = "0.15.5"
5555
human_bytes = "0.4.3"
56+
humantime = "2.1.0"
5657
hyper = { version = "0.14", features = [ "full" ] }
5758
hyper-staticfile = "0.9"
5859
indicatif = { version = "0.17.8", features = ["rayon"] }

crutest/src/main.rs

+1-3
Original file line numberDiff line numberDiff line change
@@ -1836,9 +1836,7 @@ async fn generic_workload(
18361836
}
18371837

18381838
// Make use of dsc to stop and start a downstairs while sending IO. This
1839-
// should trigger the replay code path. The IO sent to the downstairs should
1840-
// be below the threshold of gone_too_long() so we don't end up faulting the
1841-
// downstairs and doing a live repair
1839+
// should trigger the replay code path.
18421840
async fn replay_workload(
18431841
guest: &Arc<Guest>,
18441842
wtq: &mut WhenToQuit,

downstairs/src/lib.rs

+12-13
Original file line numberDiff line numberDiff line change
@@ -1140,13 +1140,13 @@ where
11401140
* │ │
11411141
* │ ┌────────────────┴────────────────────┐
11421142
* │ │ framed_write_task │
1143-
* │ └──────▲──────────────────▲──────────┘
1144-
* │ │ │
1145-
* │ ping│ │invalid
1146-
* │ ┌────────┘ │frame │responses
1147-
* │ │errors
1148-
* │ │ │
1149-
* ┌────▼───┐ message ┌┴──────┐ job ┌┴────────┐
1143+
* │ └──────▲──────────────────▲──────────┘
1144+
* │ │ │
1145+
* │ │invalid frame
1146+
* │ │errors, pings │responses
1147+
* │
1148+
* │ │ │
1149+
* ┌────▼───┐ message ┌┴──────┐ job ┌┴────────┐
11501150
* │resp_loop├──────────►│pf_task├─────────►│ dw_task │
11511151
* └─────────┘ channel └──┬────┘ channel └▲────────┘
11521152
* │ │
@@ -1308,12 +1308,7 @@ where
13081308
return Ok(());
13091309
}
13101310
Some(Ok(msg)) => {
1311-
if matches!(msg, Message::Ruok) {
1312-
// Respond instantly to pings, don't wait.
1313-
if let Err(e) = resp_channel_tx.send(Message::Imok) {
1314-
bail!("Failed sending Imok: {}", e);
1315-
}
1316-
} else if let Err(e) = message_channel_tx.send(msg) {
1311+
if let Err(e) = message_channel_tx.send(msg) {
13171312
bail!("Failed sending message to proc_frame: {}", e);
13181313
}
13191314
}
@@ -2660,6 +2655,10 @@ impl Downstairs {
26602655
resp_tx.send(msg)?;
26612656
None
26622657
}
2658+
Message::Ruok => {
2659+
resp_tx.send(Message::Imok)?;
2660+
None
2661+
}
26632662
x => bail!("unexpected frame {:?}", x),
26642663
};
26652664
Ok(r)

upstairs/Cargo.toml

+4-3
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,13 @@ http = { workspace = true, optional = true }
6262

6363
[dev-dependencies]
6464
expectorate.workspace = true
65-
openapiv3.workspace = true
65+
humantime.workspace = true
6666
openapi-lint.workspace = true
67-
tokio-test.workspace = true
67+
openapiv3.workspace = true
68+
proptest.workspace = true
6869
tempfile.workspace = true
6970
test-strategy.workspace = true
70-
proptest.workspace = true
71+
tokio-test.workspace = true
7172

7273
[build-dependencies]
7374
version_check = "0.9.4"

upstairs/src/client.rs

+4
Original file line numberDiff line numberDiff line change
@@ -2506,6 +2506,10 @@ impl ClientIoTask {
25062506
// If we're reconnecting, then add a short delay to avoid constantly
25072507
// spinning (e.g. if something is fundamentally wrong with the
25082508
// Downstairs)
2509+
//
2510+
// The upstairs can still stop us here, e.g. if we need to transition
2511+
// from Offline -> Faulted because we hit a job limit, that bounces the
2512+
// IO task (whether it *should* is debatable).
25092513
if self.delay {
25102514
tokio::select! {
25112515
s = &mut self.stop => {

0 commit comments

Comments
 (0)