Skip to content

Commit ec40e2f

Browse files
committed
Use FIOFFS on illumos instead of fsync
illumos supports sending a FIOFFS ioctl to a ZFS dataset in order to sync _all_ outstanding IO. Use this ioctl in `region_flush` for the whole region dataset instead of calling `sync_all` for each extent file. This is hidden behind a new `omicron-build` feature, which is enabled for our production builds. This necessitated separating `flush` (which commits bits to durable storage, and becomes a no-op when `omicron-build` is used) from `post_flush` (which performs clean-up or any other kind of accounting). Splitting this up exposed a bug in the sqlite implementation of ExtentInner: it wasn't checking whether the extent was dirty, and it wasn't calling `set_flush_number` before calling `sync_all` on the user data file.
1 parent 147ae59 commit ec40e2f

File tree

5 files changed

+150
-40
lines changed

5 files changed

+150
-40
lines changed

downstairs/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,4 @@ asm = ["usdt/asm"]
7171
default = []
7272
zfs_snapshot = []
7373
integration-tests = [] # Enables creating SQLite volumes
74+
omicron-build = []

downstairs/src/extent.rs

+21
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@ pub(crate) trait ExtentInner: Send + Debug {
4242
job_id: JobOrReconciliationId,
4343
) -> Result<(), CrucibleError>;
4444

45+
/// After both user data and any extent metadata have been flushed to
46+
/// durable storage, perform accounting or clean-up.
47+
fn post_flush(
48+
&mut self,
49+
new_flush: u64,
50+
new_gen: u64,
51+
job_id: JobOrReconciliationId,
52+
) -> Result<(), CrucibleError>;
53+
4554
fn read(
4655
&mut self,
4756
job_id: JobId,
@@ -590,6 +599,18 @@ impl Extent {
590599
inner.flush(new_flush, new_gen, job_id)
591600
}
592601

602+
#[instrument]
603+
pub(crate) async fn post_flush<I: Into<JobOrReconciliationId> + Debug>(
604+
&self,
605+
new_flush: u64,
606+
new_gen: u64,
607+
id: I, // only used for logging
608+
) -> Result<(), CrucibleError> {
609+
let job_id = id.into();
610+
let mut inner = self.inner.lock().await;
611+
inner.post_flush(new_flush, new_gen, job_id)
612+
}
613+
593614
pub async fn get_meta_info(&self) -> ExtentMeta {
594615
let inner = self.inner.lock().await;
595616
ExtentMeta {

downstairs/src/extent_inner_raw.rs

+46-22
Original file line numberDiff line numberDiff line change
@@ -325,24 +325,42 @@ impl ExtentInner for RawInner {
325325
// operation atomic.
326326
self.set_flush_number(new_flush, new_gen)?;
327327

328-
// Now, we fsync to ensure data is flushed to disk. It's okay to crash
329-
// before this point, because setting the flush number is atomic.
330-
cdt::extent__flush__file__start!(|| {
331-
(job_id.get(), self.extent_number, 0)
332-
});
333-
if let Err(e) = self.file.sync_all() {
334-
/*
335-
* XXX Retry? Mark extent as broken?
336-
*/
337-
return Err(CrucibleError::IoError(format!(
338-
"extent {}: fsync 1 failure: {e:?}",
339-
self.extent_number,
340-
)));
328+
if cfg!(feature = "omicron-build") {
329+
// no-op, FIOFFS will be called in region_flush
330+
} else {
331+
// Now, we fsync to ensure data is flushed to disk. It's okay to crash
332+
// before this point, because setting the flush number is atomic.
333+
cdt::extent__flush__file__start!(|| {
334+
(job_id.get(), self.extent_number, 0)
335+
});
336+
337+
if let Err(e) = self.file.sync_all() {
338+
/*
339+
* XXX Retry? Mark extent as broken?
340+
*/
341+
return Err(CrucibleError::IoError(format!(
342+
"extent {}: fsync 1 failure: {e:?}",
343+
self.extent_number,
344+
)));
345+
}
346+
347+
cdt::extent__flush__file__done!(|| {
348+
(job_id.get(), self.extent_number, 0)
349+
});
341350
}
351+
352+
cdt::extent__flush__done!(|| { (job_id.get(), self.extent_number, 0) });
353+
354+
Ok(())
355+
}
356+
357+
fn post_flush(
358+
&mut self,
359+
_new_flush: u64,
360+
_new_gen: u64,
361+
_job_id: JobOrReconciliationId,
362+
) -> Result<(), CrucibleError> {
342363
self.context_slot_dirty.fill(0);
343-
cdt::extent__flush__file__done!(|| {
344-
(job_id.get(), self.extent_number, 0)
345-
});
346364

347365
// Check for fragmentation in the context slots leading to worse
348366
// performance, and defragment if that's the case.
@@ -352,15 +370,11 @@ impl ExtentInner for RawInner {
352370
.unwrap_or(0);
353371
self.extra_syscall_count = 0;
354372
self.extra_syscall_denominator = 0;
355-
let r = if extra_syscalls_per_rw > DEFRAGMENT_THRESHOLD {
373+
if extra_syscalls_per_rw > DEFRAGMENT_THRESHOLD {
356374
self.defragment()
357375
} else {
358376
Ok(())
359-
};
360-
361-
cdt::extent__flush__done!(|| { (job_id.get(), self.extent_number, 0) });
362-
363-
r
377+
}
364378
}
365379

366380
#[cfg(test)]
@@ -1852,6 +1866,7 @@ mod test {
18521866

18531867
// Flush! This will mark all slots as synched
18541868
inner.flush(12, 12, JobId(11).into())?;
1869+
inner.post_flush(12, 12, JobId(11).into())?;
18551870
assert_eq!(inner.active_context[0], ContextSlot::B);
18561871
assert_eq!(inner.context_slot_dirty[0], 0b00);
18571872

@@ -1904,6 +1919,7 @@ mod test {
19041919

19051920
// Flush! This will mark all slots as synched
19061921
inner.flush(12, 12, JobId(11).into())?;
1922+
inner.post_flush(12, 12, JobId(11).into())?;
19071923
assert_eq!(inner.active_context[0], ContextSlot::A);
19081924
assert_eq!(inner.context_slot_dirty[0], 0b00);
19091925

@@ -2029,6 +2045,7 @@ mod test {
20292045
assert_eq!(inner.extra_syscall_count, 0);
20302046
assert_eq!(inner.extra_syscall_denominator, 5);
20312047
inner.flush(10, 10, JobId(10).into())?;
2048+
inner.post_flush(10, 10, JobId(10).into())?;
20322049
assert!(inner.context_slot_dirty.iter().all(|v| *v == 0));
20332050

20342051
// This should not have changed active context slots!
@@ -2080,6 +2097,7 @@ mod test {
20802097
// trigger defragmentation; after the flush, every context slot should
20812098
// be in array A.
20822099
inner.flush(11, 11, JobId(11).into())?;
2100+
inner.post_flush(11, 11, JobId(11).into())?;
20832101

20842102
for i in 0..10 {
20852103
assert_eq!(
@@ -2132,6 +2150,7 @@ mod test {
21322150
assert_eq!(inner.extra_syscall_count, 0);
21332151
assert_eq!(inner.extra_syscall_denominator, 3);
21342152
inner.flush(10, 10, JobId(10).into())?;
2153+
inner.post_flush(10, 10, JobId(10).into())?;
21352154

21362155
// This should not have changed active context slots!
21372156
for i in 0..10 {
@@ -2185,6 +2204,7 @@ mod test {
21852204
// trigger defragmentation; after the flush, every context slot should
21862205
// be in array B (which minimizes copies)
21872206
inner.flush(11, 11, JobId(11).into())?;
2207+
inner.post_flush(11, 11, JobId(11).into())?;
21882208

21892209
for i in 0..10 {
21902210
assert_eq!(
@@ -2239,6 +2259,7 @@ mod test {
22392259
assert_eq!(inner.extra_syscall_count, 0);
22402260
assert_eq!(inner.extra_syscall_denominator, 3);
22412261
inner.flush(10, 10, JobId(10).into())?;
2262+
inner.post_flush(10, 10, JobId(10).into())?;
22422263

22432264
// This should not have changed active context slots!
22442265
for i in 0..10 {
@@ -2293,6 +2314,7 @@ mod test {
22932314
// trigger defragmentation; after the flush, every context slot should
22942315
// be in array A (which minimizes copies)
22952316
inner.flush(11, 11, JobId(11).into())?;
2317+
inner.post_flush(11, 11, JobId(11).into())?;
22962318

22972319
for i in 0..10 {
22982320
assert_eq!(
@@ -2347,6 +2369,7 @@ mod test {
23472369
assert_eq!(inner.extra_syscall_count, 0);
23482370
assert_eq!(inner.extra_syscall_denominator, 2);
23492371
inner.flush(10, 10, JobId(10).into())?;
2372+
inner.post_flush(10, 10, JobId(10).into())?;
23502373

23512374
// This should not have changed active context slots!
23522375
for i in 0..10 {
@@ -2400,6 +2423,7 @@ mod test {
24002423
// This write didn't add enough extra syscalls to trigger
24012424
// defragmentation.
24022425
inner.flush(11, 11, JobId(11).into())?;
2426+
inner.post_flush(11, 11, JobId(11).into())?;
24032427

24042428
// These should be the same!
24052429
for i in 0..10 {

downstairs/src/extent_inner_sqlite.rs

+47-17
Original file line numberDiff line numberDiff line change
@@ -75,33 +75,63 @@ impl ExtentInner for SqliteInner {
7575
new_gen: u64,
7676
job_id: JobOrReconciliationId,
7777
) -> Result<(), CrucibleError> {
78+
if !self.dirty()? {
79+
/*
80+
* If we have made no writes to this extent since the last flush,
81+
* we do not need to update the extent on disk
82+
*/
83+
return Ok(());
84+
}
85+
86+
// We put all of our metadata updates into a single write to make this
87+
// operation atomic.
88+
self.set_flush_number(new_flush, new_gen)?;
89+
7890
// Used for profiling
7991
let n_dirty_blocks = self.dirty_blocks.len() as u64;
8092

8193
cdt::extent__flush__start!(|| {
8294
(job_id.get(), self.extent_number, n_dirty_blocks)
8395
});
8496

85-
/*
86-
* We must first fsync to get any outstanding data written to disk.
87-
* This must be done before we update the flush number.
88-
*/
89-
cdt::extent__flush__file__start!(|| {
90-
(job_id.get(), self.extent_number, n_dirty_blocks)
91-
});
92-
if let Err(e) = self.file.sync_all() {
97+
if cfg!(feature = "omicron-build") {
98+
// no-op, FIOFFS will be called in region_flush
99+
} else {
93100
/*
94-
* XXX Retry? Mark extent as broken?
101+
* We must first fsync to get any outstanding data written to disk.
102+
* This must be done before we update the flush number.
95103
*/
96-
crucible_bail!(
97-
IoError,
98-
"extent {}: fsync 1 failure: {e:?}",
99-
self.extent_number,
100-
);
104+
cdt::extent__flush__file__start!(|| {
105+
(job_id.get(), self.extent_number, n_dirty_blocks)
106+
});
107+
108+
if let Err(e) = self.file.sync_all() {
109+
/*
110+
* XXX Retry? Mark extent as broken?
111+
*/
112+
crucible_bail!(
113+
IoError,
114+
"extent {}: fsync 1 failure: {e:?}",
115+
self.extent_number,
116+
);
117+
}
118+
119+
cdt::extent__flush__file__done!(|| {
120+
(job_id.get(), self.extent_number, n_dirty_blocks)
121+
});
101122
}
102-
cdt::extent__flush__file__done!(|| {
103-
(job_id.get(), self.extent_number, n_dirty_blocks)
104-
});
123+
124+
Ok(())
125+
}
126+
127+
fn post_flush(
128+
&mut self,
129+
new_flush: u64,
130+
new_gen: u64,
131+
job_id: JobOrReconciliationId,
132+
) -> Result<(), CrucibleError> {
133+
// Used for profiling
134+
let n_dirty_blocks = self.dirty_blocks.len() as u64;
105135

106136
// Clear old block contexts. In order to be crash consistent, only
107137
// perform this after the extent fsync is done. For each block

downstairs/src/region.rs

+35-1
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,7 @@ impl Region {
807807
*/
808808
#[instrument]
809809
pub(crate) async fn region_flush_extent<
810-
I: Into<JobOrReconciliationId> + Debug,
810+
I: Into<JobOrReconciliationId> + Debug + Copy,
811811
>(
812812
&self,
813813
eid: usize,
@@ -824,10 +824,13 @@ impl Region {
824824
);
825825

826826
let extent = self.get_opened_extent(eid).await;
827+
827828
extent
828829
.flush(flush_number, gen_number, job_id, &self.log)
829830
.await?;
830831

832+
extent.post_flush(flush_number, gen_number, job_id).await?;
833+
831834
Ok(())
832835
}
833836

@@ -905,6 +908,37 @@ impl Region {
905908
result??;
906909
}
907910

911+
if cfg!(feature = "omicron-build") {
912+
use std::io;
913+
use std::os::fd::AsRawFd;
914+
use std::ptr;
915+
916+
// "file system flush", defined in illumos' sys/filio.h
917+
const FIOFFS: libc::c_int = 0x20000000 | (0x66 /*'f'*/ << 8) | 0x42 /*66*/;
918+
919+
// Open the region's mountpoint
920+
let file = File::open(&self.dir)?;
921+
922+
let rc = unsafe {
923+
libc::ioctl(
924+
file.as_raw_fd(),
925+
FIOFFS as _,
926+
ptr::null_mut::<i32>(),
927+
)
928+
};
929+
930+
if rc != 0 {
931+
return Err(io::Error::last_os_error().into());
932+
}
933+
}
934+
935+
// After the bits have been committed to durable storage, execute any post flush routine
936+
// that needs to happen
937+
for eid in &dirty_extents {
938+
let extent = self.get_opened_extent(*eid).await;
939+
extent.post_flush(flush_number, gen_number, job_id).await?;
940+
}
941+
908942
// Now everything has succeeded, we can remove these extents from the
909943
// flush candidates
910944
match extent_limit {

0 commit comments

Comments
 (0)