Skip to content

Commit 45e24ee

Browse files
authored
Make backpressure ignore Offline downstairs (#1431)
One of my complaints in #1258 is that the `Offline` → `Faulted` transition depends on reaching a certain number of jobs in the queue, but accumulating those jobs is subject to backpressure. This has a few awkward consequences:

- The time it takes to transition from `Offline` → `Faulted` depends on backpressure tuning, and we have to pick backpressure parameters that make this take a reasonable amount of time (tested in `check_offline_timeout`)
- The offline Downstairs slows down the other Downstairs based on the number of jobs in its queue, meaning the system as a whole gets slower and slower until that Downstairs is marked as faulted

This PR tweaks the backpressure implementation so that only `Active` Downstairs are counted. To do so, we move the `write_bytes_outstanding` to a per-client field. The backpressure implementation is also moved to a separate module, since we'll be adding more to it in the future. The `check_offline_timeout` test is no longer relevant and has been removed.
1 parent 6005b53 commit 45e24ee

File tree

5 files changed

+93
-133
lines changed

5 files changed

+93
-133
lines changed

upstairs/src/backpressure.rs

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// Copyright 2024 Oxide Computer Company
2+
3+
use crate::{ClientId, DownstairsIO, IOop};
4+
5+
/// Helper struct to contain a count of backpressure bytes
6+
#[derive(Debug)]
7+
pub struct BackpressureBytes(u64);
8+
9+
impl BackpressureBytes {
10+
pub fn new() -> Self {
11+
BackpressureBytes(0)
12+
}
13+
14+
pub fn get(&self) -> u64 {
15+
self.0
16+
}
17+
18+
/// Ensures that the given `DownstairsIO` is counted for backpressure
19+
///
20+
/// This is idempotent: if the job has already been counted, indicated by
21+
/// the `DownstairsIO::backpressure_bytes[c]` member being `Some(..)`, it
22+
/// will not be counted again.
23+
pub fn increment(&mut self, io: &mut DownstairsIO, c: ClientId) {
24+
match &io.work {
25+
IOop::Write { data, .. } | IOop::WriteUnwritten { data, .. } => {
26+
if !io.backpressure_bytes.contains(&c) {
27+
let n = data.len() as u64;
28+
io.backpressure_bytes.insert(c, n);
29+
self.0 += n;
30+
}
31+
}
32+
_ => (),
33+
};
34+
}
35+
36+
/// Remove the given job's contribution to backpressure
37+
///
38+
/// This is idempotent: `DownstairsIO::backpressure_bytes[c]` is set to
39+
/// `None` by this function call, so it's harmless to call repeatedly.
40+
pub fn decrement(&mut self, io: &mut DownstairsIO, c: ClientId) {
41+
if let Some(n) = io.backpressure_bytes.take(&c) {
42+
self.0 = self.0.checked_sub(n).unwrap();
43+
}
44+
}
45+
}

upstairs/src/client.rs

+13-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
// Copyright 2023 Oxide Computer Company
22
use crate::{
3-
cdt, integrity_hash, live_repair::ExtentInfo, upstairs::UpstairsConfig,
4-
upstairs::UpstairsState, ClientIOStateCount, ClientId, CrucibleDecoder,
5-
CrucibleError, DownstairsIO, DsState, EncryptionContext, IOState, IOop,
6-
JobId, Message, RawReadResponse, ReconcileIO, RegionDefinitionStatus,
7-
RegionMetadata, Validation,
3+
backpressure::BackpressureBytes, cdt, integrity_hash,
4+
live_repair::ExtentInfo, upstairs::UpstairsConfig, upstairs::UpstairsState,
5+
ClientIOStateCount, ClientId, CrucibleDecoder, CrucibleError, DownstairsIO,
6+
DsState, EncryptionContext, IOState, IOop, JobId, Message, RawReadResponse,
7+
ReconcileIO, RegionDefinitionStatus, RegionMetadata, Validation,
88
};
99
use crucible_common::{
1010
deadline_secs, verbose_timeout, x509::TLSContext, ExtentId,
@@ -126,6 +126,9 @@ pub(crate) struct DownstairsClient {
126126
/// estimate per-client backpressure to keep the 3x downstairs in sync.
127127
pub(crate) bytes_outstanding: u64,
128128

129+
/// Write bytes in this queue, used for global backpressure
130+
pub(crate) write_bytes_outstanding: BackpressureBytes,
131+
129132
/// UUID for this downstairs region
130133
///
131134
/// Unpopulated until provided by `Message::RegionInfo`
@@ -230,6 +233,7 @@ impl DownstairsClient {
230233
repair_info: None,
231234
io_state_count: ClientIOStateCount::new(),
232235
bytes_outstanding: 0,
236+
write_bytes_outstanding: BackpressureBytes::new(),
233237
connection_id: ConnectionId(0),
234238
client_delay_us,
235239
}
@@ -270,6 +274,7 @@ impl DownstairsClient {
270274
repair_info: None,
271275
io_state_count: ClientIOStateCount::new(),
272276
bytes_outstanding: 0,
277+
write_bytes_outstanding: BackpressureBytes::new(),
273278
connection_id: ConnectionId(0),
274279
client_delay_us,
275280
}
@@ -379,10 +384,12 @@ impl DownstairsClient {
379384
.bytes_outstanding
380385
.checked_sub(job.work.job_bytes())
381386
.unwrap();
387+
self.write_bytes_outstanding.decrement(job, self.client_id);
382388
} else if is_running && !was_running {
383389
// This should only happen if a job is replayed, but that still
384390
// counts!
385391
self.bytes_outstanding += job.work.job_bytes();
392+
self.write_bytes_outstanding.increment(job, self.client_id);
386393
}
387394

388395
old_state
@@ -934,6 +941,7 @@ impl DownstairsClient {
934941
};
935942
if r == IOState::New {
936943
self.bytes_outstanding += io.work.job_bytes();
944+
self.write_bytes_outstanding.increment(io, self.client_id);
937945
}
938946
self.io_state_count.incr(&r);
939947
r

upstairs/src/downstairs.rs

+26-78
Original file line numberDiff line numberDiff line change
@@ -73,17 +73,6 @@ pub(crate) struct Downstairs {
7373
/// The active list of IO for the downstairs.
7474
pub(crate) ds_active: ActiveJobs,
7575

76-
/// The number of write bytes that haven't finished yet
77-
///
78-
/// This is used to configure backpressure to the host, because writes
79-
/// (uniquely) will return before actually being completed by a Downstairs
80-
/// and can clog up the queues.
81-
///
82-
/// It is stored in the Downstairs because from the perspective of the
83-
/// Upstairs, writes complete immediately; only the Downstairs is actually
84-
/// tracking the pending jobs.
85-
write_bytes_outstanding: BackpressureBytes,
86-
8776
/// The next Job ID this Upstairs should use for downstairs work.
8877
next_id: JobId,
8978

@@ -139,44 +128,6 @@ pub(crate) struct Downstairs {
139128
reqwest_client: reqwest::Client,
140129
}
141130

142-
/// Helper struct to contain a count of backpressure bytes
143-
#[derive(Debug)]
144-
struct BackpressureBytes(u64);
145-
146-
impl BackpressureBytes {
147-
fn new() -> Self {
148-
BackpressureBytes(0)
149-
}
150-
151-
/// Ensures that the given `DownstairsIO` is counted for backpressure
152-
///
153-
/// This is idempotent: if the job has already been counted, indicated by
154-
/// the `DownstairsIO::backpressure_bytes` member being `Some(..)`, it will
155-
/// not be counted again.
156-
fn increment(&mut self, io: &mut DownstairsIO) {
157-
match &io.work {
158-
IOop::Write { data, .. } | IOop::WriteUnwritten { data, .. } => {
159-
if io.backpressure_bytes.is_none() {
160-
let n = data.len() as u64;
161-
io.backpressure_bytes = Some(n);
162-
self.0 += n;
163-
}
164-
}
165-
_ => (),
166-
};
167-
}
168-
169-
/// Remove the given job's contribution to backpressure
170-
///
171-
/// This is idempotent: `DownstairsIO::backpressure_bytes` is set to `None`
172-
/// by this function call, so it's harmless to call repeatedly.
173-
fn decrement(&mut self, io: &mut DownstairsIO) {
174-
if let Some(n) = io.backpressure_bytes.take() {
175-
self.0 = self.0.checked_sub(n).unwrap();
176-
}
177-
}
178-
}
179-
180131
/// State machine for a live-repair operation
181132
///
182133
/// We pass through all states (except `FinalFlush`) in order for each extent,
@@ -331,7 +282,6 @@ impl Downstairs {
331282
cfg,
332283
next_flush: 0,
333284
ds_active: ActiveJobs::new(),
334-
write_bytes_outstanding: BackpressureBytes::new(),
335285
completed: AllocRingBuffer::new(2048),
336286
completed_jobs: AllocRingBuffer::new(8),
337287
next_id: JobId(1000),
@@ -936,10 +886,6 @@ impl Downstairs {
936886
if self.clients[client_id].replay_job(job) {
937887
count += 1;
938888
}
939-
940-
// Make sure this job counts for backpressure (this is a no-op if
941-
// the job is already counted).
942-
self.write_bytes_outstanding.increment(job);
943889
});
944890
info!(
945891
self.log,
@@ -1519,7 +1465,7 @@ impl Downstairs {
15191465
replay: false,
15201466
data: None,
15211467
read_validations: Vec::new(),
1522-
backpressure_bytes: None,
1468+
backpressure_bytes: ClientMap::new(),
15231469
}
15241470
}
15251471

@@ -1643,7 +1589,7 @@ impl Downstairs {
16431589
replay: false,
16441590
data: None,
16451591
read_validations: Vec::new(),
1646-
backpressure_bytes: None,
1592+
backpressure_bytes: ClientMap::new(),
16471593
}
16481594
}
16491595

@@ -1784,7 +1730,7 @@ impl Downstairs {
17841730
replay: false,
17851731
data: None,
17861732
read_validations: Vec::new(),
1787-
backpressure_bytes: None,
1733+
backpressure_bytes: ClientMap::new(),
17881734
}
17891735
}
17901736

@@ -1842,7 +1788,7 @@ impl Downstairs {
18421788
replay: false,
18431789
data: None,
18441790
read_validations: Vec::new(),
1845-
backpressure_bytes: None,
1791+
backpressure_bytes: ClientMap::new(),
18461792
};
18471793
self.enqueue(io);
18481794
ds_id
@@ -1926,7 +1872,7 @@ impl Downstairs {
19261872
replay: false,
19271873
data: None,
19281874
read_validations: Vec::new(),
1929-
backpressure_bytes: None,
1875+
backpressure_bytes: ClientMap::new(),
19301876
};
19311877
self.enqueue(io);
19321878
ds_id
@@ -1957,7 +1903,7 @@ impl Downstairs {
19571903
replay: false,
19581904
data: None,
19591905
read_validations: Vec::new(),
1960-
backpressure_bytes: None,
1906+
backpressure_bytes: ClientMap::new(),
19611907
}
19621908
}
19631909

@@ -2366,7 +2312,7 @@ impl Downstairs {
23662312
replay: false,
23672313
data: None,
23682314
read_validations: Vec::new(),
2369-
backpressure_bytes: None,
2315+
backpressure_bytes: ClientMap::new(),
23702316
};
23712317

23722318
self.enqueue(fl);
@@ -2493,7 +2439,7 @@ impl Downstairs {
24932439
replay: false,
24942440
data: None,
24952441
read_validations: Vec::new(),
2496-
backpressure_bytes: None,
2442+
backpressure_bytes: ClientMap::new(),
24972443
};
24982444

24992445
self.enqueue(io);
@@ -2580,7 +2526,6 @@ impl Downstairs {
25802526
}
25812527

25822528
// Make sure this job is counted for backpressure
2583-
self.write_bytes_outstanding.increment(&mut io);
25842529
let is_write = matches!(io.work, IOop::Write { .. });
25852530

25862531
// Puts the IO onto the downstairs work queue.
@@ -2963,12 +2908,17 @@ impl Downstairs {
29632908
// they are completed (in `process_io_completion_inner`),
29642909
// **not** when they are retired. We'll do a sanity check here
29652910
// and print a warning if that's not the case.
2966-
if job.backpressure_bytes.is_some() {
2967-
warn!(
2968-
self.log,
2969-
"job {ds_id} had pending backpressure bytes"
2970-
);
2971-
self.write_bytes_outstanding.decrement(&mut job);
2911+
for c in ClientId::iter() {
2912+
if job.backpressure_bytes.contains(&c) {
2913+
warn!(
2914+
self.log,
2915+
"job {ds_id} had pending backpressure bytes \
2916+
for client {c}"
2917+
);
2918+
self.clients[c]
2919+
.write_bytes_outstanding
2920+
.decrement(&mut job, c);
2921+
}
29722922
}
29732923
}
29742924

@@ -3467,14 +3417,6 @@ impl Downstairs {
34673417
self.ackable_work.insert(ds_id);
34683418
}
34693419

3470-
// Write bytes no longer count for backpressure once all 3x downstairs
3471-
// have returned (although they'll continue to be stored until they are
3472-
// retired by the next flush).
3473-
let wc = job.state_count();
3474-
if (wc.error + wc.skipped + wc.done) == 3 {
3475-
self.write_bytes_outstanding.decrement(job);
3476-
}
3477-
34783420
/*
34793421
* If all 3 jobs are done, we can check here to see if we can
34803422
* remove this job from the DS list. If we have completed the ack
@@ -3492,6 +3434,7 @@ impl Downstairs {
34923434
// If we are a write or a flush with one success, then
34933435
// we must switch our state to failed. This condition is
34943436
// handled when we check the job result.
3437+
let wc = job.state_count();
34953438
if (wc.error + wc.skipped + wc.done) == 3 {
34963439
self.ackable_work.insert(ds_id);
34973440
debug!(self.log, "[{}] Set AckReady {}", client_id, job.ds_id);
@@ -3589,7 +3532,12 @@ impl Downstairs {
35893532
}
35903533

35913534
pub(crate) fn write_bytes_outstanding(&self) -> u64 {
3592-
self.write_bytes_outstanding.0
3535+
self.clients
3536+
.iter()
3537+
.filter(|c| matches!(c.state(), DsState::Active))
3538+
.map(|c| c.write_bytes_outstanding.get())
3539+
.max()
3540+
.unwrap_or(0)
35933541
}
35943542

35953543
/// Marks a single job as acked

upstairs/src/guest.rs

-49
Original file line numberDiff line numberDiff line change
@@ -1461,55 +1461,6 @@ mod test {
14611461
Ok(())
14621462
}
14631463

1464-
/// Confirm that the offline timeout is reasonable
1465-
#[test]
1466-
fn check_offline_timeout() {
1467-
for job_size in
1468-
[512, 4 * 1024, 16 * 1024, 64 * 1024, 256 * 1024, 1024 * 1024]
1469-
{
1470-
let mut bytes_in_flight = 0;
1471-
let mut jobs_in_flight = 0;
1472-
let mut time_usec: u64 = 0;
1473-
let cfg = Guest::default_backpressure_config();
1474-
1475-
let (t, desc) = loop {
1476-
let bp_usec =
1477-
cfg.get_backpressure_us(bytes_in_flight, jobs_in_flight);
1478-
time_usec = time_usec.saturating_add(bp_usec);
1479-
1480-
if bytes_in_flight >= IO_OUTSTANDING_MAX_BYTES {
1481-
break (time_usec, "bytes");
1482-
}
1483-
1484-
if jobs_in_flight >= IO_OUTSTANDING_MAX_JOBS as u64 {
1485-
break (time_usec, "jobs");
1486-
}
1487-
1488-
bytes_in_flight += job_size;
1489-
jobs_in_flight += 1;
1490-
};
1491-
1492-
let timeout = Duration::from_micros(t);
1493-
assert!(
1494-
timeout > Duration::from_secs(1),
1495-
"offline -> faulted transition happens too quickly \
1496-
with job size {job_size}; expected > 1 sec, got {}",
1497-
humantime::format_duration(timeout)
1498-
);
1499-
assert!(
1500-
timeout < Duration::from_secs(180),
1501-
"offline -> faulted transition happens too slowly \
1502-
with job size {job_size}; expected < 3 mins, got {}",
1503-
humantime::format_duration(timeout)
1504-
);
1505-
1506-
println!(
1507-
"job size {job_size:>8}:\n Timeout in {} ({desc})\n",
1508-
humantime::format_duration(timeout)
1509-
);
1510-
}
1511-
}
1512-
15131464
#[test]
15141465
fn check_max_backpressure() {
15151466
let cfg = Guest::default_backpressure_config();

0 commit comments

Comments (0)