Skip to content

Commit ec40e2f

Browse files
committed
Use FIOFFS on illumos instead of fsync
illumos supports sending a FIOFFS ioctl to a ZFS dataset in order to sync _all_ outstanding IO. Use this ioctl in `region_flush` for the whole region dataset instead of calling `sync_all` for each extent file. This is hidden behind a new `omicron-build` feature, which is enabled for our production builds. This necessitated separating `flush` (which commits bits to durable storage, and becomes a no-op when `omicron-build` is used) from `post_flush` (which performs clean-up or any other kind of accounting). Splitting this up exposed a bug in the sqlite implementation of ExtentInner: it wasn't checking whether the extent was dirty, and it wasn't calling `set_flush_number` before calling `sync_all` on the user data file.
1 parent 147ae59 commit ec40e2f

File tree

5 files changed

+150
-40
lines changed

5 files changed

+150
-40
lines changed

downstairs/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,4 @@ asm = ["usdt/asm"]
7171
default = []
7272
zfs_snapshot = []
7373
integration-tests = [] # Enables creating SQLite volumes
74+
omicron-build = []

downstairs/src/extent.rs

+21
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@ pub(crate) trait ExtentInner: Send + Debug {
4242
job_id: JobOrReconciliationId,
4343
) -> Result<(), CrucibleError>;
4444

45+
/// After both user data and any extent metadata have been flushed to
46+
/// durable storage, perform accounting or clean-up.
47+
fn post_flush(
48+
&mut self,
49+
new_flush: u64,
50+
new_gen: u64,
51+
job_id: JobOrReconciliationId,
52+
) -> Result<(), CrucibleError>;
53+
4554
fn read(
4655
&mut self,
4756
job_id: JobId,
@@ -590,6 +599,18 @@ impl Extent {
590599
inner.flush(new_flush, new_gen, job_id)
591600
}
592601

602+
#[instrument]
603+
pub(crate) async fn post_flush<I: Into<JobOrReconciliationId> + Debug>(
604+
&self,
605+
new_flush: u64,
606+
new_gen: u64,
607+
id: I, // only used for logging
608+
) -> Result<(), CrucibleError> {
609+
let job_id = id.into();
610+
let mut inner = self.inner.lock().await;
611+
inner.post_flush(new_flush, new_gen, job_id)
612+
}
613+
593614
pub async fn get_meta_info(&self) -> ExtentMeta {
594615
let inner = self.inner.lock().await;
595616
ExtentMeta {

downstairs/src/extent_inner_raw.rs

+46-22
Original file line numberDiff line numberDiff line change
@@ -325,24 +325,42 @@ impl ExtentInner for RawInner {
325325
// operation atomic.
326326
self.set_flush_number(new_flush, new_gen)?;
327327

328-
// Now, we fsync to ensure data is flushed to disk. It's okay to crash
329-
// before this point, because setting the flush number is atomic.
330-
cdt::extent__flush__file__start!(|| {
331-
(job_id.get(), self.extent_number, 0)
332-
});
333-
if let Err(e) = self.file.sync_all() {
334-
/*
335-
* XXX Retry? Mark extent as broken?
336-
*/
337-
return Err(CrucibleError::IoError(format!(
338-
"extent {}: fsync 1 failure: {e:?}",
339-
self.extent_number,
340-
)));
328+
if cfg!(feature = "omicron-build") {
329+
// no-op, FIOFFS will be called in region_flush
330+
} else {
331+
// Now, we fsync to ensure data is flushed to disk. It's okay to crash
332+
// before this point, because setting the flush number is atomic.
333+
cdt::extent__flush__file__start!(|| {
334+
(job_id.get(), self.extent_number, 0)
335+
});
336+
337+
if let Err(e) = self.file.sync_all() {
338+
/*
339+
* XXX Retry? Mark extent as broken?
340+
*/
341+
return Err(CrucibleError::IoError(format!(
342+
"extent {}: fsync 1 failure: {e:?}",
343+
self.extent_number,
344+
)));
345+
}
346+
347+
cdt::extent__flush__file__done!(|| {
348+
(job_id.get(), self.extent_number, 0)
349+
});
341350
}
351+
352+
cdt::extent__flush__done!(|| { (job_id.get(), self.extent_number, 0) });
353+
354+
Ok(())
355+
}
356+
357+
fn post_flush(
358+
&mut self,
359+
_new_flush: u64,
360+
_new_gen: u64,
361+
_job_id: JobOrReconciliationId,
362+
) -> Result<(), CrucibleError> {
342363
self.context_slot_dirty.fill(0);
343-
cdt::extent__flush__file__done!(|| {
344-
(job_id.get(), self.extent_number, 0)
345-
});
346364

347365
// Check for fragmentation in the context slots leading to worse
348366
// performance, and defragment if that's the case.
@@ -352,15 +370,11 @@ impl ExtentInner for RawInner {
352370
.unwrap_or(0);
353371
self.extra_syscall_count = 0;
354372
self.extra_syscall_denominator = 0;
355-
let r = if extra_syscalls_per_rw > DEFRAGMENT_THRESHOLD {
373+
if extra_syscalls_per_rw > DEFRAGMENT_THRESHOLD {
356374
self.defragment()
357375
} else {
358376
Ok(())
359-
};
360-
361-
cdt::extent__flush__done!(|| { (job_id.get(), self.extent_number, 0) });
362-
363-
r
377+
}
364378
}
365379

366380
#[cfg(test)]
@@ -1852,6 +1866,7 @@ mod test {
18521866

18531867
// Flush! This will mark all slots as synched
18541868
inner.flush(12, 12, JobId(11).into())?;
1869+
inner.post_flush(12, 12, JobId(11).into())?;
18551870
assert_eq!(inner.active_context[0], ContextSlot::B);
18561871
assert_eq!(inner.context_slot_dirty[0], 0b00);
18571872

@@ -1904,6 +1919,7 @@ mod test {
19041919

19051920
// Flush! This will mark all slots as synched
19061921
inner.flush(12, 12, JobId(11).into())?;
1922+
inner.post_flush(12, 12, JobId(11).into())?;
19071923
assert_eq!(inner.active_context[0], ContextSlot::A);
19081924
assert_eq!(inner.context_slot_dirty[0], 0b00);
19091925

@@ -2029,6 +2045,7 @@ mod test {
20292045
assert_eq!(inner.extra_syscall_count, 0);
20302046
assert_eq!(inner.extra_syscall_denominator, 5);
20312047
inner.flush(10, 10, JobId(10).into())?;
2048+
inner.post_flush(10, 10, JobId(10).into())?;
20322049
assert!(inner.context_slot_dirty.iter().all(|v| *v == 0));
20332050

20342051
// This should not have changed active context slots!
@@ -2080,6 +2097,7 @@ mod test {
20802097
// trigger defragmentation; after the flush, every context slot should
20812098
// be in array A.
20822099
inner.flush(11, 11, JobId(11).into())?;
2100+
inner.post_flush(11, 11, JobId(11).into())?;
20832101

20842102
for i in 0..10 {
20852103
assert_eq!(
@@ -2132,6 +2150,7 @@ mod test {
21322150
assert_eq!(inner.extra_syscall_count, 0);
21332151
assert_eq!(inner.extra_syscall_denominator, 3);
21342152
inner.flush(10, 10, JobId(10).into())?;
2153+
inner.post_flush(10, 10, JobId(10).into())?;
21352154

21362155
// This should not have changed active context slots!
21372156
for i in 0..10 {
@@ -2185,6 +2204,7 @@ mod test {
21852204
// trigger defragmentation; after the flush, every context slot should
21862205
// be in array B (which minimizes copies)
21872206
inner.flush(11, 11, JobId(11).into())?;
2207+
inner.post_flush(11, 11, JobId(11).into())?;
21882208

21892209
for i in 0..10 {
21902210
assert_eq!(
@@ -2239,6 +2259,7 @@ mod test {
22392259
assert_eq!(inner.extra_syscall_count, 0);
22402260
assert_eq!(inner.extra_syscall_denominator, 3);
22412261
inner.flush(10, 10, JobId(10).into())?;
2262+
inner.post_flush(10, 10, JobId(10).into())?;
22422263

22432264
// This should not have changed active context slots!
22442265
for i in 0..10 {
@@ -2293,6 +2314,7 @@ mod test {
22932314
// trigger defragmentation; after the flush, every context slot should
22942315
// be in array A (which minimizes copies)
22952316
inner.flush(11, 11, JobId(11).into())?;
2317+
inner.post_flush(11, 11, JobId(11).into())?;
22962318

22972319
for i in 0..10 {
22982320
assert_eq!(
@@ -2347,6 +2369,7 @@ mod test {
23472369
assert_eq!(inner.extra_syscall_count, 0);
23482370
assert_eq!(inner.extra_syscall_denominator, 2);
23492371
inner.flush(10, 10, JobId(10).into())?;
2372+
inner.post_flush(10, 10, JobId(10).into())?;
23502373

23512374
// This should not have changed active context slots!
23522375
for i in 0..10 {
@@ -2400,6 +2423,7 @@ mod test {
24002423
// This write didn't add enough extra syscalls to trigger
24012424
// defragmentation.
24022425
inner.flush(11, 11, JobId(11).into())?;
2426+
inner.post_flush(11, 11, JobId(11).into())?;
24032427

24042428
// These should be the same!
24052429
for i in 0..10 {

downstairs/src/extent_inner_sqlite.rs

+47-17
Original file line numberDiff line numberDiff line change
@@ -75,33 +75,63 @@ impl ExtentInner for SqliteInner {
7575
new_gen: u64,
7676
job_id: JobOrReconciliationId,
7777
) -> Result<(), CrucibleError> {
78+
if !self.dirty()? {
79+
/*
80+
* If we have made no writes to this extent since the last flush,
81+
* we do not need to update the extent on disk
82+
*/
83+
return Ok(());
84+
}
85+
86+
// We put all of our metadata updates into a single write to make this
87+
// operation atomic.
88+
self.set_flush_number(new_flush, new_gen)?;
89+
7890
// Used for profiling
7991
let n_dirty_blocks = self.dirty_blocks.len() as u64;
8092

8193
cdt::extent__flush__start!(|| {
8294
(job_id.get(), self.extent_number, n_dirty_blocks)
8395
});
8496

85-
/*
86-
* We must first fsync to get any outstanding data written to disk.
87-
* This must be done before we update the flush number.
88-
*/
89-
cdt::extent__flush__file__start!(|| {
90-
(job_id.get(), self.extent_number, n_dirty_blocks)
91-
});
92-
if let Err(e) = self.file.sync_all() {
97+
if cfg!(feature = "omicron-build") {
98+
// no-op, FIOFFS will be called in region_flush
99+
} else {
93100
/*
94-
* XXX Retry? Mark extent as broken?
101+
* We must first fsync to get any outstanding data written to disk.
102+
* This must be done before we update the flush number.
95103
*/
96-
crucible_bail!(
97-
IoError,
98-
"extent {}: fsync 1 failure: {e:?}",
99-
self.extent_number,
100-
);
104+
cdt::extent__flush__file__start!(|| {
105+
(job_id.get(), self.extent_number, n_dirty_blocks)
106+
});
107+
108+
if let Err(e) = self.file.sync_all() {
109+
/*
110+
* XXX Retry? Mark extent as broken?
111+
*/
112+
crucible_bail!(
113+
IoError,
114+
"extent {}: fsync 1 failure: {e:?}",
115+
self.extent_number,
116+
);
117+
}
118+
119+
cdt::extent__flush__file__done!(|| {
120+
(job_id.get(), self.extent_number, n_dirty_blocks)
121+
});
101122
}
102-
cdt::extent__flush__file__done!(|| {
103-
(job_id.get(), self.extent_number, n_dirty_blocks)
104-
});
123+
124+
Ok(())
125+
}
126+
127+
fn post_flush(
128+
&mut self,
129+
new_flush: u64,
130+
new_gen: u64,
131+
job_id: JobOrReconciliationId,
132+
) -> Result<(), CrucibleError> {
133+
// Used for profiling
134+
let n_dirty_blocks = self.dirty_blocks.len() as u64;
105135

106136
// Clear old block contexts. In order to be crash consistent, only
107137
// perform this after the extent fsync is done. For each block

downstairs/src/region.rs

+35-1
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,7 @@ impl Region {
807807
*/
808808
#[instrument]
809809
pub(crate) async fn region_flush_extent<
810-
I: Into<JobOrReconciliationId> + Debug,
810+
I: Into<JobOrReconciliationId> + Debug + Copy,
811811
>(
812812
&self,
813813
eid: usize,
@@ -824,10 +824,13 @@ impl Region {
824824
);
825825

826826
let extent = self.get_opened_extent(eid).await;
827+
827828
extent
828829
.flush(flush_number, gen_number, job_id, &self.log)
829830
.await?;
830831

832+
extent.post_flush(flush_number, gen_number, job_id).await?;
833+
831834
Ok(())
832835
}
833836

@@ -905,6 +908,37 @@ impl Region {
905908
result??;
906909
}
907910

911+
if cfg!(feature = "omicron-build") {
912+
use std::io;
913+
use std::os::fd::AsRawFd;
914+
use std::ptr;
915+
916+
// "file system flush", defined in illumos' sys/filio.h
917+
const FIOFFS: libc::c_int = 0x20000000 | (0x66 /*'f'*/ << 8) | 0x42 /*66*/;
918+
919+
// Open the region's mountpoint
920+
let file = File::open(&self.dir)?;
921+
922+
let rc = unsafe {
923+
libc::ioctl(
924+
file.as_raw_fd(),
925+
FIOFFS as _,
926+
ptr::null_mut::<i32>(),
927+
)
928+
};
929+
930+
if rc != 0 {
931+
return Err(io::Error::last_os_error().into());
932+
}
933+
}
934+
935+
// After the bits have been committed to durable storage, execute any post flush routine
936+
// that needs to happen
937+
for eid in &dirty_extents {
938+
let extent = self.get_opened_extent(*eid).await;
939+
extent.post_flush(flush_number, gen_number, job_id).await?;
940+
}
941+
908942
// Now everything has succeeded, we can remove these extents from the
909943
// flush candidates
910944
match extent_limit {

0 commit comments

Comments
 (0)