diff --git a/downstairs/Cargo.toml b/downstairs/Cargo.toml
index 2cc93866e..ca92474d2 100644
--- a/downstairs/Cargo.toml
+++ b/downstairs/Cargo.toml
@@ -71,3 +71,4 @@ asm = ["usdt/asm"]
 default = []
 zfs_snapshot = []
 integration-tests = [] # Enables creating SQLite volumes
+omicron-build = []
diff --git a/downstairs/src/extent.rs b/downstairs/src/extent.rs
index a327cc8af..8aec80dc7 100644
--- a/downstairs/src/extent.rs
+++ b/downstairs/src/extent.rs
@@ -42,6 +42,15 @@ pub(crate) trait ExtentInner: Send + Debug {
         job_id: JobOrReconciliationId,
     ) -> Result<(), CrucibleError>;
 
+    /// After both user data and any extent metadata have been flushed to
+    /// durable storage, perform accounting or clean-up.
+    fn post_flush(
+        &mut self,
+        new_flush: u64,
+        new_gen: u64,
+        job_id: JobOrReconciliationId,
+    ) -> Result<(), CrucibleError>;
+
     fn read(
         &mut self,
         job_id: JobId,
@@ -590,6 +599,18 @@ impl Extent {
         inner.flush(new_flush, new_gen, job_id)
     }
 
+    #[instrument]
+    pub(crate) async fn post_flush<I: Into<JobOrReconciliationId> + Debug>(
+        &self,
+        new_flush: u64,
+        new_gen: u64,
+        id: I, // only used for logging
+    ) -> Result<(), CrucibleError> {
+        let job_id = id.into();
+        let mut inner = self.inner.lock().await;
+        inner.post_flush(new_flush, new_gen, job_id)
+    }
+
     pub async fn get_meta_info(&self) -> ExtentMeta {
         let inner = self.inner.lock().await;
         ExtentMeta {
diff --git a/downstairs/src/extent_inner_raw.rs b/downstairs/src/extent_inner_raw.rs
index ecf17a0a4..fbecc87e1 100644
--- a/downstairs/src/extent_inner_raw.rs
+++ b/downstairs/src/extent_inner_raw.rs
@@ -325,24 +325,42 @@ impl ExtentInner for RawInner {
         // operation atomic.
         self.set_flush_number(new_flush, new_gen)?;
 
-        // Now, we fsync to ensure data is flushed to disk. It's okay to crash
-        // before this point, because setting the flush number is atomic.
-        cdt::extent__flush__file__start!(|| {
-            (job_id.get(), self.extent_number, 0)
-        });
-        if let Err(e) = self.file.sync_all() {
-            /*
-             * XXX Retry? Mark extent as broken?
-             */
-            return Err(CrucibleError::IoError(format!(
-                "extent {}: fsync 1 failure: {e:?}",
-                self.extent_number,
-            )));
+        if cfg!(feature = "omicron-build") {
+            // no-op, FIOFFS will be called in region_flush
+        } else {
+            // Now, we fsync to ensure data is flushed to disk. It's okay to crash
+            // before this point, because setting the flush number is atomic.
+            cdt::extent__flush__file__start!(|| {
+                (job_id.get(), self.extent_number, 0)
+            });
+
+            if let Err(e) = self.file.sync_all() {
+                /*
+                 * XXX Retry? Mark extent as broken?
+                 */
+                return Err(CrucibleError::IoError(format!(
+                    "extent {}: fsync 1 failure: {e:?}",
+                    self.extent_number,
+                )));
+            }
+
+            cdt::extent__flush__file__done!(|| {
+                (job_id.get(), self.extent_number, 0)
+            });
         }
+
+        cdt::extent__flush__done!(|| { (job_id.get(), self.extent_number, 0) });
+
+        Ok(())
+    }
+
+    fn post_flush(
+        &mut self,
+        _new_flush: u64,
+        _new_gen: u64,
+        _job_id: JobOrReconciliationId,
+    ) -> Result<(), CrucibleError> {
         self.context_slot_dirty.fill(0);
-        cdt::extent__flush__file__done!(|| {
-            (job_id.get(), self.extent_number, 0)
-        });
 
         // Check for fragmentation in the context slots leading to worse
         // performance, and defragment if that's the case.
@@ -352,15 +370,11 @@ impl ExtentInner for RawInner {
             .unwrap_or(0);
         self.extra_syscall_count = 0;
         self.extra_syscall_denominator = 0;
-        let r = if extra_syscalls_per_rw > DEFRAGMENT_THRESHOLD {
+        if extra_syscalls_per_rw > DEFRAGMENT_THRESHOLD {
             self.defragment()
         } else {
             Ok(())
-        };
-
-        cdt::extent__flush__done!(|| { (job_id.get(), self.extent_number, 0) });
-
-        r
+        }
     }
 
     #[cfg(test)]
@@ -1852,6 +1866,7 @@ mod test {
 
         // Flush! This will mark all slots as synched
         inner.flush(12, 12, JobId(11).into())?;
+        inner.post_flush(12, 12, JobId(11).into())?;
 
         assert_eq!(inner.active_context[0], ContextSlot::B);
         assert_eq!(inner.context_slot_dirty[0], 0b00);
@@ -1904,6 +1919,7 @@ mod test {
 
         // Flush! This will mark all slots as synched
         inner.flush(12, 12, JobId(11).into())?;
+        inner.post_flush(12, 12, JobId(11).into())?;
 
         assert_eq!(inner.active_context[0], ContextSlot::A);
         assert_eq!(inner.context_slot_dirty[0], 0b00);
@@ -2029,6 +2045,7 @@ mod test {
         assert_eq!(inner.extra_syscall_count, 0);
         assert_eq!(inner.extra_syscall_denominator, 5);
         inner.flush(10, 10, JobId(10).into())?;
+        inner.post_flush(10, 10, JobId(10).into())?;
         assert!(inner.context_slot_dirty.iter().all(|v| *v == 0));
 
         // This should not have changed active context slots!
@@ -2080,6 +2097,7 @@ mod test {
         // trigger defragmentation; after the flush, every context slot should
         // be in array A.
         inner.flush(11, 11, JobId(11).into())?;
+        inner.post_flush(11, 11, JobId(11).into())?;
 
         for i in 0..10 {
             assert_eq!(
@@ -2132,6 +2150,7 @@ mod test {
         assert_eq!(inner.extra_syscall_count, 0);
         assert_eq!(inner.extra_syscall_denominator, 3);
         inner.flush(10, 10, JobId(10).into())?;
+        inner.post_flush(10, 10, JobId(10).into())?;
 
         // This should not have changed active context slots!
         for i in 0..10 {
@@ -2185,6 +2204,7 @@ mod test {
         // trigger defragmentation; after the flush, every context slot should
         // be in array B (which minimizes copies)
         inner.flush(11, 11, JobId(11).into())?;
+        inner.post_flush(11, 11, JobId(11).into())?;
 
         for i in 0..10 {
             assert_eq!(
@@ -2239,6 +2259,7 @@ mod test {
         assert_eq!(inner.extra_syscall_count, 0);
         assert_eq!(inner.extra_syscall_denominator, 3);
         inner.flush(10, 10, JobId(10).into())?;
+        inner.post_flush(10, 10, JobId(10).into())?;
 
         // This should not have changed active context slots!
         for i in 0..10 {
@@ -2293,6 +2314,7 @@ mod test {
         // trigger defragmentation; after the flush, every context slot should
         // be in array A (which minimizes copies)
         inner.flush(11, 11, JobId(11).into())?;
+        inner.post_flush(11, 11, JobId(11).into())?;
 
         for i in 0..10 {
             assert_eq!(
@@ -2347,6 +2369,7 @@ mod test {
         assert_eq!(inner.extra_syscall_count, 0);
         assert_eq!(inner.extra_syscall_denominator, 2);
         inner.flush(10, 10, JobId(10).into())?;
+        inner.post_flush(10, 10, JobId(10).into())?;
 
         // This should not have changed active context slots!
         for i in 0..10 {
@@ -2400,6 +2423,7 @@ mod test {
         // This write didn't add enough extra syscalls to trigger
         // defragmentation.
         inner.flush(11, 11, JobId(11).into())?;
+        inner.post_flush(11, 11, JobId(11).into())?;
 
         // These should be the same!
         for i in 0..10 {
diff --git a/downstairs/src/extent_inner_sqlite.rs b/downstairs/src/extent_inner_sqlite.rs
index d58a305a4..3d0a84e85 100644
--- a/downstairs/src/extent_inner_sqlite.rs
+++ b/downstairs/src/extent_inner_sqlite.rs
@@ -75,6 +75,18 @@ impl ExtentInner for SqliteInner {
         new_gen: u64,
         job_id: JobOrReconciliationId,
     ) -> Result<(), CrucibleError> {
+        if !self.dirty()? {
+            /*
+             * If we have made no writes to this extent since the last flush,
+             * we do not need to update the extent on disk
+             */
+            return Ok(());
+        }
+
+        // We put all of our metadata updates into a single write to make this
+        // operation atomic.
+        self.set_flush_number(new_flush, new_gen)?;
+
         // Used for profiling
         let n_dirty_blocks = self.dirty_blocks.len() as u64;
@@ -82,26 +94,44 @@ impl ExtentInner for SqliteInner {
             (job_id.get(), self.extent_number, n_dirty_blocks)
         });
 
-        /*
-         * We must first fsync to get any outstanding data written to disk.
-         * This must be done before we update the flush number.
-         */
-        cdt::extent__flush__file__start!(|| {
-            (job_id.get(), self.extent_number, n_dirty_blocks)
-        });
-        if let Err(e) = self.file.sync_all() {
+        if cfg!(feature = "omicron-build") {
+            // no-op, FIOFFS will be called in region_flush
+        } else {
             /*
-             * XXX Retry? Mark extent as broken?
+             * We must first fsync to get any outstanding data written to disk.
+             * This must be done before we update the flush number.
              */
-            crucible_bail!(
-                IoError,
-                "extent {}: fsync 1 failure: {e:?}",
-                self.extent_number,
-            );
+            cdt::extent__flush__file__start!(|| {
+                (job_id.get(), self.extent_number, n_dirty_blocks)
+            });
+
+            if let Err(e) = self.file.sync_all() {
+                /*
+                 * XXX Retry? Mark extent as broken?
+                 */
+                crucible_bail!(
+                    IoError,
+                    "extent {}: fsync 1 failure: {e:?}",
+                    self.extent_number,
+                );
+            }
+
+            cdt::extent__flush__file__done!(|| {
+                (job_id.get(), self.extent_number, n_dirty_blocks)
+            });
         }
-        cdt::extent__flush__file__done!(|| {
-            (job_id.get(), self.extent_number, n_dirty_blocks)
-        });
+
+        Ok(())
+    }
+
+    fn post_flush(
+        &mut self,
+        new_flush: u64,
+        new_gen: u64,
+        job_id: JobOrReconciliationId,
+    ) -> Result<(), CrucibleError> {
+        // Used for profiling
+        let n_dirty_blocks = self.dirty_blocks.len() as u64;
 
         // Clear old block contexts. In order to be crash consistent, only
         // perform this after the extent fsync is done. For each block
diff --git a/downstairs/src/region.rs b/downstairs/src/region.rs
index 76105ca51..e9c75326e 100644
--- a/downstairs/src/region.rs
+++ b/downstairs/src/region.rs
@@ -807,7 +807,7 @@ impl Region {
      */
     #[instrument]
     pub(crate) async fn region_flush_extent<
-        I: Into<JobOrReconciliationId> + Debug,
+        I: Into<JobOrReconciliationId> + Debug + Copy,
     >(
         &self,
         eid: usize,
@@ -824,10 +824,13 @@ impl Region {
         );
 
         let extent = self.get_opened_extent(eid).await;
+
         extent
             .flush(flush_number, gen_number, job_id, &self.log)
             .await?;
+
+        extent.post_flush(flush_number, gen_number, job_id).await?;
+
         Ok(())
     }
 
@@ -905,6 +908,43 @@ impl Region {
             result??;
         }
 
+        if cfg!(feature = "omicron-build") {
+            // If using `omicron-build`, then we're running on illumos and the
+            // region is backed by a ZFS dataset. Issue a _FIOFFS call, which
+            // will result in a `zfs_sync` to the entire region dataset. If this
+            // feature is enabled then the `extent.flush` calls above will _not_
+            // sync their data, but will update the flush and gen numbers and
+            // clear the dirty bit.
+            use std::io;
+            use std::os::fd::AsRawFd;
+            use std::ptr;
+
+            // "file system flush", defined in illumos' sys/filio.h
+            const FIOFFS: libc::c_int = 0x20000000 | (0x66 /*'f'*/ << 8) | 0x42 /*66*/;
+
+            // Open the region's mountpoint
+            let file = File::open(&self.dir)?;
+
+            let rc = unsafe {
+                libc::ioctl(
+                    file.as_raw_fd(),
+                    FIOFFS as _,
+                    ptr::null_mut::<i32>(),
+                )
+            };
+
+            if rc != 0 {
+                return Err(io::Error::last_os_error().into());
+            }
+        }
+
+        // After the bits have been committed to durable storage, execute any
+        // post flush routine that needs to happen
+        for eid in &dirty_extents {
+            let extent = self.get_opened_extent(*eid).await;
+            extent.post_flush(flush_number, gen_number, job_id).await?;
+        }
+
         // Now everything has succeeded, we can remove these extents from the
         // flush candidates
         match extent_limit {
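
Reviewer note: for context, the `_FIOFFS` path added above reduces to the standalone pattern below. This is a minimal sketch, assuming an illumos host and the `libc` crate; the `flush_filesystem` helper name and its error handling are illustrative only and not part of this change.

use std::fs::File;
use std::io;
use std::os::fd::AsRawFd;
use std::path::Path;
use std::ptr;

// "file system flush" (_FIOFFS), defined in illumos' sys/filio.h.
const FIOFFS: libc::c_int = 0x20000000 | (0x66 /*'f'*/ << 8) | 0x42 /*66*/;

/// Ask the filesystem backing `dir` to commit all dirty data to stable
/// storage. Any open descriptor on the dataset works; the region directory
/// is the natural choice.
fn flush_filesystem(dir: &Path) -> io::Result<()> {
    let file = File::open(dir)?;

    // _FIOFFS takes no argument, so a null pointer is passed. The `as _`
    // cast matches whatever integer type this platform's ioctl() expects.
    let rc = unsafe {
        libc::ioctl(file.as_raw_fd(), FIOFFS as _, ptr::null_mut::<i32>())
    };

    if rc != 0 {
        return Err(io::Error::last_os_error());
    }
    Ok(())
}

One such call against the region directory replaces one fsync per dirty extent, which is why the per-extent `sync_all` is skipped when the `omicron-build` feature is enabled and the post-flush accounting is deferred until after the ioctl returns.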