Skip to content

Commit 30cdad6

Browse files
committed
Use FIOFFS instead of fsync (on supported platforms)
1 parent 9f340f2 commit 30cdad6

File tree

7 files changed

+311
-128
lines changed

7 files changed

+311
-128
lines changed

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ itertools = "0.12.1"
6161
libc = "0.2"
6262
mime_guess = "2.0.4"
6363
nbd = "0.2.3"
64-
nix = { version = "0.28", features = [ "feature", "uio" ] }
64+
nix = { version = "0.28", features = [ "feature", "uio", "ioctl" ] }
6565
num_enum = "0.7"
6666
num-derive = "0.4"
6767
num-traits = "0.2"

downstairs/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,4 @@ asm = ["usdt/asm"]
7171
default = []
7272
zfs_snapshot = []
7373
integration-tests = [] # Enables creating SQLite volumes
74+
omicron-build = [] # Uses FIOFFS for flushes instead of fsync

downstairs/src/extent.rs

+75-6
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,45 @@ pub(crate) trait ExtentInner: Send + Sync + Debug {
3434
fn flush_number(&self) -> Result<u64, CrucibleError>;
3535
fn dirty(&self) -> Result<bool, CrucibleError>;
3636

37-
fn flush(
37+
/// Performs any metadata updates needed before a flush
38+
fn pre_flush(
39+
&mut self,
40+
new_flush: u64,
41+
new_gen: u64,
42+
job_id: JobOrReconciliationId,
43+
) -> Result<(), CrucibleError>;
44+
45+
/// Syncs all relevant data to persistent storage
46+
fn flush_inner(
47+
&mut self,
48+
job_id: JobOrReconciliationId,
49+
) -> Result<(), CrucibleError>;
50+
51+
/// Performs any metadata updates after syncing data to persistent storage
52+
fn post_flush(
3853
&mut self,
3954
new_flush: u64,
4055
new_gen: u64,
4156
job_id: JobOrReconciliationId,
4257
) -> Result<(), CrucibleError>;
4358

59+
/// Performs a full flush (pre/inner/post)
60+
///
61+
/// This is only exposed for the sake of unit testing; normal code should
62+
/// use the fine-grained functions and be forced to consider performance.
63+
#[cfg(test)]
64+
fn flush(
65+
&mut self,
66+
new_flush: u64,
67+
new_gen: u64,
68+
job_id: JobOrReconciliationId,
69+
) -> Result<(), CrucibleError> {
70+
self.pre_flush(new_flush, new_gen, job_id)?;
71+
self.flush_inner(job_id)?;
72+
self.post_flush(new_flush, new_gen, job_id)?;
73+
Ok(())
74+
}
75+
4476
fn read(
4577
&mut self,
4678
job_id: JobId,
@@ -579,22 +611,25 @@ impl Extent {
579611
Ok(())
580612
}
581613

582-
#[instrument]
583-
pub(crate) async fn flush<I: Into<JobOrReconciliationId> + Debug>(
614+
/// Prepares for a flush
615+
///
616+
/// Returns `false` if we should skip the flush (because this extent is not
617+
/// dirty), or `true` if we should proceed.
618+
pub(crate) fn pre_flush<I: Into<JobOrReconciliationId> + Debug>(
584619
&mut self,
585620
new_flush: u64,
586621
new_gen: u64,
587622
id: I, // only used for logging
588623
log: &Logger,
589-
) -> Result<(), CrucibleError> {
624+
) -> Result<bool, CrucibleError> {
590625
let job_id: JobOrReconciliationId = id.into();
591626

592627
if !self.inner.dirty()? {
593628
/*
594629
* If we have made no writes to this extent since the last flush,
595630
* we do not need to update the extent on disk
596631
*/
597-
return Ok(());
632+
return Ok(false);
598633
}
599634

600635
// Read only extents should never have the dirty bit set. If they do,
@@ -605,7 +640,41 @@ impl Extent {
605640
crucible_bail!(ModifyingReadOnlyRegion);
606641
}
607642

608-
self.inner.flush(new_flush, new_gen, job_id)
643+
self.inner.pre_flush(new_flush, new_gen, job_id)?;
644+
Ok(true)
645+
}
646+
647+
/// Finalizes a flush after data has been synced to persistent storage
648+
///
649+
/// Delegates to the inner extent's `post_flush` to perform any metadata
650+
/// updates that must happen only after the sync has completed.
651+
pub(crate) fn post_flush<I: Into<JobOrReconciliationId> + Debug>(
652+
&mut self,
653+
new_flush: u64,
654+
new_gen: u64,
655+
id: I, // only used for logging
656+
) -> Result<(), CrucibleError> {
657+
let job_id: JobOrReconciliationId = id.into();
658+
self.inner.post_flush(new_flush, new_gen, job_id)
659+
}
660+
661+
/// Flushes this extent if it is dirty
662+
#[instrument]
663+
pub(crate) async fn flush<
664+
I: Into<JobOrReconciliationId> + Debug + Copy + Clone,
665+
>(
666+
&mut self,
667+
new_flush: u64,
668+
new_gen: u64,
669+
id: I, // only used for logging
670+
log: &Logger,
671+
) -> Result<(), CrucibleError> {
672+
if !self.pre_flush(new_flush, new_gen, id, log)? {
673+
return Ok(());
674+
}
675+
self.inner.flush_inner(id.into())?;
676+
self.post_flush(new_flush, new_gen, id)?;
677+
Ok(())
609678
}
610679

611680
#[allow(clippy::unused_async)] // this will be async again in the future

downstairs/src/extent_inner_raw.rs

+20-16
Original file line numberDiff line numberDiff line change
@@ -246,20 +246,12 @@ impl ExtentInner for RawInner {
246246
Ok(ExtentReadResponse { data: buf, blocks })
247247
}
248248

249-
fn flush(
249+
fn pre_flush(
250250
&mut self,
251251
new_flush: u64,
252252
new_gen: u64,
253253
job_id: JobOrReconciliationId,
254254
) -> Result<(), CrucibleError> {
255-
if !self.dirty()? {
256-
/*
257-
* If we have made no writes to this extent since the last flush,
258-
* we do not need to update the extent on disk
259-
*/
260-
return Ok(());
261-
}
262-
263255
cdt::extent__flush__start!(|| {
264256
(job_id.get(), self.extent_number, 0)
265257
});
@@ -268,11 +260,17 @@ impl ExtentInner for RawInner {
268260
// operation atomic.
269261
self.set_flush_number(new_flush, new_gen)?;
270262

263+
Ok(())
264+
}
265+
266+
fn flush_inner(
267+
&mut self,
268+
job_id: JobOrReconciliationId,
269+
) -> Result<(), CrucibleError> {
271270
// Now, we fsync to ensure data is flushed to disk. It's okay to crash
272271
// before this point, because setting the flush number is atomic.
273-
cdt::extent__flush__file__start!(|| {
274-
(job_id.get(), self.extent_number, 0)
275-
});
272+
cdt::extent__flush__file__start!(|| (job_id.get(), self.extent_number));
273+
276274
if let Err(e) = self.file.sync_all() {
277275
/*
278276
* XXX Retry? Mark extent as broken?
@@ -283,10 +281,16 @@ impl ExtentInner for RawInner {
283281
)));
284282
}
285283
self.context_slot_dirty.fill(0);
286-
cdt::extent__flush__file__done!(|| {
287-
(job_id.get(), self.extent_number, 0)
288-
});
284+
cdt::extent__flush__file__done!(|| (job_id.get(), self.extent_number));
285+
Ok(())
286+
}
289287

288+
fn post_flush(
289+
&mut self,
290+
_new_flush: u64,
291+
_new_gen: u64,
292+
job_id: JobOrReconciliationId,
293+
) -> Result<(), CrucibleError> {
290294
// Check for fragmentation in the context slots leading to worse
291295
// performance, and defragment if that's the case.
292296
let extra_syscalls_per_rw = self
@@ -301,7 +305,7 @@ impl ExtentInner for RawInner {
301305
Ok(())
302306
};
303307

304-
cdt::extent__flush__done!(|| { (job_id.get(), self.extent_number, 0) });
308+
cdt::extent__flush__done!(|| (job_id.get(), self.extent_number));
305309

306310
r
307311
}

downstairs/src/extent_inner_sqlite.rs

+55-23
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,32 @@ impl ExtentInner for SqliteInner {
3333
self.0.lock().unwrap().dirty()
3434
}
3535

36-
fn flush(
36+
fn pre_flush(
3737
&mut self,
3838
new_flush: u64,
3939
new_gen: u64,
4040
job_id: JobOrReconciliationId,
4141
) -> Result<(), CrucibleError> {
42-
self.0.lock().unwrap().flush(new_flush, new_gen, job_id)
42+
self.0.lock().unwrap().pre_flush(new_flush, new_gen, job_id)
43+
}
44+
45+
fn flush_inner(
46+
&mut self,
47+
job_id: JobOrReconciliationId,
48+
) -> Result<(), CrucibleError> {
49+
self.0.lock().unwrap().flush_inner(job_id)
50+
}
51+
52+
fn post_flush(
53+
&mut self,
54+
new_flush: u64,
55+
new_gen: u64,
56+
job_id: JobOrReconciliationId,
57+
) -> Result<(), CrucibleError> {
58+
self.0
59+
.lock()
60+
.unwrap()
61+
.post_flush(new_flush, new_gen, job_id)
4362
}
4463

4564
fn read(
@@ -191,10 +210,10 @@ impl SqliteMoreInner {
191210
Ok(self.dirty.get())
192211
}
193212

194-
fn flush(
213+
fn pre_flush(
195214
&mut self,
196-
new_flush: u64,
197-
new_gen: u64,
215+
_new_flush: u64,
216+
_new_gen: u64,
198217
job_id: JobOrReconciliationId,
199218
) -> Result<(), CrucibleError> {
200219
// Used for profiling
@@ -204,13 +223,18 @@ impl SqliteMoreInner {
204223
(job_id.get(), self.extent_number, n_dirty_blocks)
205224
});
206225

226+
Ok(())
227+
}
228+
229+
fn flush_inner(
230+
&mut self,
231+
job_id: JobOrReconciliationId,
232+
) -> Result<(), CrucibleError> {
207233
/*
208234
* We must first fsync to get any outstanding data written to disk.
209235
* This must be done before we update the flush number.
210236
*/
211-
cdt::extent__flush__file__start!(|| {
212-
(job_id.get(), self.extent_number, n_dirty_blocks)
213-
});
237+
cdt::extent__flush__file__start!(|| (job_id.get(), self.extent_number));
214238
if let Err(e) = self.file.sync_all() {
215239
/*
216240
* XXX Retry? Mark extent as broken?
@@ -221,10 +245,17 @@ impl SqliteMoreInner {
221245
self.extent_number,
222246
);
223247
}
224-
cdt::extent__flush__file__done!(|| {
225-
(job_id.get(), self.extent_number, n_dirty_blocks)
226-
});
248+
cdt::extent__flush__file__done!(|| (job_id.get(), self.extent_number));
227249

250+
Ok(())
251+
}
252+
253+
fn post_flush(
254+
&mut self,
255+
new_flush: u64,
256+
new_gen: u64,
257+
job_id: JobOrReconciliationId,
258+
) -> Result<(), CrucibleError> {
228259
// Clear old block contexts. In order to be crash consistent, only
229260
// perform this after the extent fsync is done. For each block
230261
// written since the last flush, remove all block context rows where
@@ -233,9 +264,10 @@ impl SqliteMoreInner {
233264
// values were written. When the region is first opened, the entire
234265
// file is rehashed, since in that case we don't have that luxury.
235266

236-
cdt::extent__flush__collect__hashes__start!(|| {
237-
(job_id.get(), self.extent_number, n_dirty_blocks)
238-
});
267+
cdt::extent__flush__collect__hashes__start!(|| (
268+
job_id.get(),
269+
self.extent_number
270+
));
239271

240272
// Rehash any parts of the file that we *may have written* data to since
241273
// the last flush. (If we know that we wrote the data, then we don't
@@ -246,9 +278,10 @@ impl SqliteMoreInner {
246278
(job_id.get(), self.extent_number, n_rehashed as u64)
247279
});
248280

249-
cdt::extent__flush__sqlite__insert__start!(|| {
250-
(job_id.get(), self.extent_number, n_dirty_blocks)
251-
});
281+
cdt::extent__flush__sqlite__insert__start!(|| (
282+
job_id.get(),
283+
self.extent_number
284+
));
252285

253286
// We put all of our metadb updates into a single transaction to
254287
// assure that we have a single sync.
@@ -261,9 +294,10 @@ impl SqliteMoreInner {
261294
&tx,
262295
)?;
263296

264-
cdt::extent__flush__sqlite__insert__done!(|| {
265-
(job_id.get(), self.extent_number, n_dirty_blocks)
266-
});
297+
cdt::extent__flush__sqlite__insert__done!(|| (
298+
job_id.get(),
299+
self.extent_number
300+
));
267301

268302
self.set_flush_number(new_flush, new_gen)?;
269303
tx.commit()?;
@@ -272,9 +306,7 @@ impl SqliteMoreInner {
272306
// Finally, reset the file's seek offset to 0
273307
self.file.seek(SeekFrom::Start(0))?;
274308

275-
cdt::extent__flush__done!(|| {
276-
(job_id.get(), self.extent_number, n_dirty_blocks)
277-
});
309+
cdt::extent__flush__done!(|| (job_id.get(), self.extent_number));
278310
Ok(())
279311
}
280312

0 commit comments

Comments
 (0)