Skip to content

Commit 1c1574f

Browse files
authored
Per client, queue-based backpressure (#1186)
This PR adds per-client delays to prevent queues from becoming arbitrarily long during read-heavy workflows. These delays are based on either per-client queue lengths or bytes in flight (whichever is worse compared to the slowest client), which is analogous to the guest backpressure implementation. Compared to main, this keeps the queues relatively bounded, and performance is at least similar (hard to tell, because there is significant run-to-run variability). Fixes #1167, and is an alternative to #1181.
1 parent fbe6f12 commit 1c1574f

File tree

5 files changed

+268
-6
lines changed

5 files changed

+268
-6
lines changed

upstairs/src/client.rs

+94-6
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,22 @@ use crate::{
1010
use crucible_common::x509::TLSContext;
1111
use crucible_protocol::{ReconciliationId, CRUCIBLE_MESSAGE_VERSION};
1212

13-
use std::{collections::BTreeSet, net::SocketAddr, sync::Arc};
13+
use std::{
14+
collections::BTreeSet,
15+
net::SocketAddr,
16+
sync::{
17+
atomic::{AtomicU64, Ordering},
18+
Arc,
19+
},
20+
};
1421

1522
use futures::StreamExt;
1623
use slog::{debug, error, info, o, warn, Logger};
1724
use tokio::{
1825
io::AsyncWriteExt,
1926
net::{TcpSocket, TcpStream},
2027
sync::{mpsc, oneshot},
21-
time::sleep_until,
28+
time::{sleep_until, Duration},
2229
};
2330
use tokio_util::codec::{Encoder, FramedRead};
2431
use uuid::Uuid;
@@ -154,6 +161,12 @@ pub(crate) struct DownstairsClient {
154161
/// IO state counters
155162
pub(crate) io_state_count: ClientIOStateCount,
156163

164+
/// Bytes in queues for this client
165+
///
166+
/// This includes read, write, and write-unwritten jobs, and is used to
167+
/// estimate per-client backpressure to keep the 3x downstairs in sync.
168+
pub(crate) bytes_outstanding: u64,
169+
157170
/// UUID for this downstairs region
158171
///
159172
/// Unpopulated until provided by `Message::RegionInfo`
@@ -216,6 +229,9 @@ pub(crate) struct DownstairsClient {
216229

217230
/// Session ID for a clients connection to a downstairs.
218231
connection_id: ConnectionId,
232+
233+
/// Per-client delay, shared with the [`DownstairsClient`]
234+
client_delay_us: Arc<AtomicU64>,
219235
}
220236

221237
impl DownstairsClient {
@@ -226,6 +242,7 @@ impl DownstairsClient {
226242
log: Logger,
227243
tls_context: Option<Arc<crucible_common::x509::TLSContext>>,
228244
) -> Self {
245+
let client_delay_us = Arc::new(AtomicU64::new(0));
229246
Self {
230247
cfg,
231248
client_task: Self::new_io_task(
@@ -234,6 +251,7 @@ impl DownstairsClient {
234251
false, // do not start the task until GoActive
235252
client_id,
236253
tls_context.clone(),
254+
client_delay_us.clone(),
237255
&log,
238256
),
239257
client_id,
@@ -252,7 +270,9 @@ impl DownstairsClient {
252270
region_metadata: None,
253271
repair_info: None,
254272
io_state_count: ClientIOStateCount::new(),
273+
bytes_outstanding: 0,
255274
connection_id: ConnectionId(0),
275+
client_delay_us,
256276
}
257277
}
258278

@@ -262,6 +282,7 @@ impl DownstairsClient {
262282
/// client will disappear into the void.
263283
#[cfg(test)]
264284
fn test_default() -> Self {
285+
let client_delay_us = Arc::new(AtomicU64::new(0));
265286
let cfg = Arc::new(UpstairsConfig {
266287
encryption_context: None,
267288
upstairs_id: Uuid::new_v4(),
@@ -289,7 +310,9 @@ impl DownstairsClient {
289310
region_metadata: None,
290311
repair_info: None,
291312
io_state_count: ClientIOStateCount::new(),
313+
bytes_outstanding: 0,
292314
connection_id: ConnectionId(0),
315+
client_delay_us,
293316
}
294317
}
295318

@@ -407,9 +430,26 @@ impl DownstairsClient {
407430
job: &mut DownstairsIO,
408431
new_state: IOState,
409432
) -> IOState {
433+
let is_running =
434+
matches!(new_state, IOState::New | IOState::InProgress);
410435
self.io_state_count.incr(&new_state);
411436
let old_state = job.state.insert(self.client_id, new_state);
437+
let was_running =
438+
matches!(old_state, IOState::New | IOState::InProgress);
412439
self.io_state_count.decr(&old_state);
440+
441+
// Update our bytes-in-flight counter
442+
if was_running && !is_running {
443+
self.bytes_outstanding = self
444+
.bytes_outstanding
445+
.checked_sub(job.work.job_bytes())
446+
.unwrap();
447+
} else if is_running && !was_running {
448+
// This should only happen if a job is replayed, but that still
449+
// counts!
450+
self.bytes_outstanding += job.work.job_bytes();
451+
}
452+
413453
old_state
414454
}
415455

@@ -471,8 +511,12 @@ impl DownstairsClient {
471511
}
472512

473513
/// Sets this job as skipped and moves it to `skipped_jobs`
514+
///
515+
/// # Panics
516+
/// If the job is not new or in-progress
474517
pub(crate) fn skip_job(&mut self, job: &mut DownstairsIO) {
475-
self.set_job_state(job, IOState::Skipped);
518+
let prev_state = self.set_job_state(job, IOState::Skipped);
519+
assert!(matches!(prev_state, IOState::New | IOState::InProgress));
476520
self.skipped_jobs.insert(job.ds_id);
477521
}
478522

@@ -628,6 +672,7 @@ impl DownstairsClient {
628672
}
629673

630674
self.connection_id.update();
675+
631676
// Restart with a short delay
632677
self.start_task(true, auto_promote);
633678
}
@@ -652,6 +697,7 @@ impl DownstairsClient {
652697
connect,
653698
self.client_id,
654699
self.tls_context.clone(),
700+
self.client_delay_us.clone(),
655701
&self.log,
656702
);
657703
}
@@ -662,6 +708,7 @@ impl DownstairsClient {
662708
connect: bool,
663709
client_id: ClientId,
664710
tls_context: Option<Arc<TLSContext>>,
711+
client_delay_us: Arc<AtomicU64>,
665712
log: &Logger,
666713
) -> ClientTaskHandle {
667714
#[cfg(test)]
@@ -672,6 +719,7 @@ impl DownstairsClient {
672719
connect,
673720
client_id,
674721
tls_context,
722+
client_delay_us,
675723
log,
676724
)
677725
} else {
@@ -685,6 +733,7 @@ impl DownstairsClient {
685733
connect,
686734
client_id,
687735
tls_context,
736+
client_delay_us,
688737
log,
689738
)
690739
}
@@ -695,6 +744,7 @@ impl DownstairsClient {
695744
connect: bool,
696745
client_id: ClientId,
697746
tls_context: Option<Arc<TLSContext>>,
747+
client_delay_us: Arc<AtomicU64>,
698748
log: &Logger,
699749
) -> ClientTaskHandle {
700750
// These channels must support at least MAX_ACTIVE_COUNT messages;
@@ -730,6 +780,7 @@ impl DownstairsClient {
730780
log: log.clone(),
731781
},
732782
delay,
783+
client_delay_us,
733784
log,
734785
};
735786
c.run().await
@@ -945,6 +996,9 @@ impl DownstairsClient {
945996
IOState::New
946997
}
947998
};
999+
if r == IOState::New {
1000+
self.bytes_outstanding += io.work.job_bytes();
1001+
}
9481002
self.io_state_count.incr(&r);
9491003
r
9501004
}
@@ -1261,9 +1315,8 @@ impl DownstairsClient {
12611315
}
12621316
};
12631317

1264-
let old_state = job.state.insert(self.client_id, new_state.clone());
1265-
self.io_state_count.decr(&old_state);
1266-
self.io_state_count.incr(&new_state);
1318+
// Update the state, maintaining various counters
1319+
let old_state = self.set_job_state(job, new_state.clone());
12671320

12681321
/*
12691322
* Verify the job was InProgress
@@ -2209,6 +2262,10 @@ impl DownstairsClient {
22092262
(self.io_state_count.new + self.io_state_count.in_progress) as usize
22102263
}
22112264

2265+
pub(crate) fn total_bytes_outstanding(&self) -> usize {
2266+
self.bytes_outstanding as usize
2267+
}
2268+
22122269
/// Returns a unique ID for the current connection, or `None`
22132270
///
22142271
/// This can be used to disambiguate between messages returned from
@@ -2220,6 +2277,16 @@ impl DownstairsClient {
22202277
None
22212278
}
22222279
}
2280+
2281+
/// Sets the per-client delay
2282+
pub(crate) fn set_delay_us(&self, delay: u64) {
2283+
self.client_delay_us.store(delay, Ordering::Relaxed);
2284+
}
2285+
2286+
/// Looks up the per-client delay
2287+
pub(crate) fn get_delay_us(&self) -> u64 {
2288+
self.client_delay_us.load(Ordering::Relaxed)
2289+
}
22232290
}
22242291

22252292
/// How to handle "promote to active" requests
@@ -2420,6 +2487,9 @@ struct ClientIoTask {
24202487
/// Handle for the rx task
24212488
recv_task: ClientRxTask,
24222489

2490+
/// Shared handle to receive per-client backpressure delay
2491+
client_delay_us: Arc<AtomicU64>,
2492+
24232493
log: Logger,
24242494
}
24252495

@@ -2692,6 +2762,24 @@ impl ClientIoTask {
26922762
+ std::marker::Send
26932763
+ 'static,
26942764
{
2765+
// Delay communication with this client based on backpressure, to keep
2766+
// the three clients relatively in sync with each other.
2767+
//
2768+
// We don't need to delay writes, because they're already constrained by
2769+
// the global backpressure system and cannot build up an unbounded
2770+
// queue. This is admittedly quite subtle; see crucible#1167 for
2771+
// discussions and graphs.
2772+
if !matches!(
2773+
m,
2774+
ClientRequest::Message(Message::Write { .. })
2775+
| ClientRequest::RawMessage(RawMessage::Write { .. }, ..)
2776+
) {
2777+
let d = self.client_delay_us.load(Ordering::Relaxed);
2778+
if d > 0 {
2779+
tokio::time::sleep(Duration::from_micros(d)).await;
2780+
}
2781+
}
2782+
26952783
// There's some duplication between this function and `cmd_loop` above,
26962784
// but it's not obvious whether there's a cleaner way to organize stuff.
26972785
tokio::select! {

0 commit comments

Comments
 (0)