Skip to content

Commit 30cdad6

Browse files
committed
Use FIOFFS instead of fsync (on supported platforms)
1 parent 9f340f2 commit 30cdad6

File tree

7 files changed

+311
-128
lines changed

7 files changed

+311
-128
lines changed

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ itertools = "0.12.1"
6161
libc = "0.2"
6262
mime_guess = "2.0.4"
6363
nbd = "0.2.3"
64-
nix = { version = "0.28", features = [ "feature", "uio" ] }
64+
nix = { version = "0.28", features = [ "feature", "uio", "ioctl" ] }
6565
num_enum = "0.7"
6666
num-derive = "0.4"
6767
num-traits = "0.2"

downstairs/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,4 @@ asm = ["usdt/asm"]
7171
default = []
7272
zfs_snapshot = []
7373
integration-tests = [] # Enables creating SQLite volumes
74+
omicron-build = [] # Uses FIOFFS for flushes instead of fsync

downstairs/src/extent.rs

+75-6
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,45 @@ pub(crate) trait ExtentInner: Send + Sync + Debug {
3434
fn flush_number(&self) -> Result<u64, CrucibleError>;
3535
fn dirty(&self) -> Result<bool, CrucibleError>;
3636

37-
fn flush(
37+
/// Performs any metadata updates needed before a flush
38+
fn pre_flush(
39+
&mut self,
40+
new_flush: u64,
41+
new_gen: u64,
42+
job_id: JobOrReconciliationId,
43+
) -> Result<(), CrucibleError>;
44+
45+
/// Syncs all relevant data to persistent storage
46+
fn flush_inner(
47+
&mut self,
48+
job_id: JobOrReconciliationId,
49+
) -> Result<(), CrucibleError>;
50+
51+
/// Performs any metadata updates after syncing data to persistent storage
52+
fn post_flush(
3853
&mut self,
3954
new_flush: u64,
4055
new_gen: u64,
4156
job_id: JobOrReconciliationId,
4257
) -> Result<(), CrucibleError>;
4358

59+
/// Performs a full flush (pre/inner/post)
60+
///
61+
/// This is only exposed for the sake of unit testing; normal code should
62+
/// use the fine-grained functions and be forced to consider performance.
63+
#[cfg(test)]
64+
fn flush(
65+
&mut self,
66+
new_flush: u64,
67+
new_gen: u64,
68+
job_id: JobOrReconciliationId,
69+
) -> Result<(), CrucibleError> {
70+
self.pre_flush(new_flush, new_gen, job_id)?;
71+
self.flush_inner(job_id)?;
72+
self.post_flush(new_flush, new_gen, job_id)?;
73+
Ok(())
74+
}
75+
4476
fn read(
4577
&mut self,
4678
job_id: JobId,
@@ -579,22 +611,25 @@ impl Extent {
579611
Ok(())
580612
}
581613

582-
#[instrument]
583-
pub(crate) async fn flush<I: Into<JobOrReconciliationId> + Debug>(
614+
/// Prepares for a flush
615+
///
616+
/// Returns `false` if we should skip the flush (because this extent is not
617+
/// dirty), or `true` if we should proceed.
618+
pub(crate) fn pre_flush<I: Into<JobOrReconciliationId> + Debug>(
584619
&mut self,
585620
new_flush: u64,
586621
new_gen: u64,
587622
id: I, // only used for logging
588623
log: &Logger,
589-
) -> Result<(), CrucibleError> {
624+
) -> Result<bool, CrucibleError> {
590625
let job_id: JobOrReconciliationId = id.into();
591626

592627
if !self.inner.dirty()? {
593628
/*
594629
* If we have made no writes to this extent since the last flush,
595630
* we do not need to update the extent on disk
596631
*/
597-
return Ok(());
632+
return Ok(false);
598633
}
599634

600635
// Read only extents should never have the dirty bit set. If they do,
@@ -605,7 +640,41 @@ impl Extent {
605640
crucible_bail!(ModifyingReadOnlyRegion);
606641
}
607642

608-
self.inner.flush(new_flush, new_gen, job_id)
643+
self.inner.pre_flush(new_flush, new_gen, job_id)?;
644+
Ok(true)
645+
}
646+
647+
/// Finalizes a flush after data has been synced to persistent storage
648+
///
649+
/// Delegates to the inner extent's `post_flush` to perform any metadata
650+
/// updates that must happen only after the sync has completed.
651+
pub(crate) fn post_flush<I: Into<JobOrReconciliationId> + Debug>(
652+
&mut self,
653+
new_flush: u64,
654+
new_gen: u64,
655+
id: I, // only used for logging
656+
) -> Result<(), CrucibleError> {
657+
let job_id: JobOrReconciliationId = id.into();
658+
self.inner.post_flush(new_flush, new_gen, job_id)
659+
}
660+
661+
/// Flushes this extent if it is dirty
662+
#[instrument]
663+
pub(crate) async fn flush<
664+
I: Into<JobOrReconciliationId> + Debug + Copy + Clone,
665+
>(
666+
&mut self,
667+
new_flush: u64,
668+
new_gen: u64,
669+
id: I, // only used for logging
670+
log: &Logger,
671+
) -> Result<(), CrucibleError> {
672+
if !self.pre_flush(new_flush, new_gen, id, log)? {
673+
return Ok(());
674+
}
675+
self.inner.flush_inner(id.into())?;
676+
self.post_flush(new_flush, new_gen, id)?;
677+
Ok(())
609678
}
610679

611680
#[allow(clippy::unused_async)] // this will be async again in the future

downstairs/src/extent_inner_raw.rs

+20-16
Original file line numberDiff line numberDiff line change
@@ -246,20 +246,12 @@ impl ExtentInner for RawInner {
246246
Ok(ExtentReadResponse { data: buf, blocks })
247247
}
248248

249-
fn flush(
249+
fn pre_flush(
250250
&mut self,
251251
new_flush: u64,
252252
new_gen: u64,
253253
job_id: JobOrReconciliationId,
254254
) -> Result<(), CrucibleError> {
255-
if !self.dirty()? {
256-
/*
257-
* If we have made no writes to this extent since the last flush,
258-
* we do not need to update the extent on disk
259-
*/
260-
return Ok(());
261-
}
262-
263255
cdt::extent__flush__start!(|| {
264256
(job_id.get(), self.extent_number, 0)
265257
});
@@ -268,11 +260,17 @@ impl ExtentInner for RawInner {
268260
// operation atomic.
269261
self.set_flush_number(new_flush, new_gen)?;
270262

263+
Ok(())
264+
}
265+
266+
fn flush_inner(
267+
&mut self,
268+
job_id: JobOrReconciliationId,
269+
) -> Result<(), CrucibleError> {
271270
// Now, we fsync to ensure data is flushed to disk. It's okay to crash
272271
// before this point, because setting the flush number is atomic.
273-
cdt::extent__flush__file__start!(|| {
274-
(job_id.get(), self.extent_number, 0)
275-
});
272+
cdt::extent__flush__file__start!(|| (job_id.get(), self.extent_number));
273+
276274
if let Err(e) = self.file.sync_all() {
277275
/*
278276
* XXX Retry? Mark extent as broken?
@@ -283,10 +281,16 @@ impl ExtentInner for RawInner {
283281
)));
284282
}
285283
self.context_slot_dirty.fill(0);
286-
cdt::extent__flush__file__done!(|| {
287-
(job_id.get(), self.extent_number, 0)
288-
});
284+
cdt::extent__flush__file__done!(|| (job_id.get(), self.extent_number));
285+
Ok(())
286+
}
289287

288+
fn post_flush(
289+
&mut self,
290+
_new_flush: u64,
291+
_new_gen: u64,
292+
job_id: JobOrReconciliationId,
293+
) -> Result<(), CrucibleError> {
290294
// Check for fragmentation in the context slots leading to worse
291295
// performance, and defragment if that's the case.
292296
let extra_syscalls_per_rw = self
@@ -301,7 +305,7 @@ impl ExtentInner for RawInner {
301305
Ok(())
302306
};
303307

304-
cdt::extent__flush__done!(|| { (job_id.get(), self.extent_number, 0) });
308+
cdt::extent__flush__done!(|| (job_id.get(), self.extent_number));
305309

306310
r
307311
}

downstairs/src/extent_inner_sqlite.rs

+55-23
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,32 @@ impl ExtentInner for SqliteInner {
3333
self.0.lock().unwrap().dirty()
3434
}
3535

36-
fn flush(
36+
fn pre_flush(
3737
&mut self,
3838
new_flush: u64,
3939
new_gen: u64,
4040
job_id: JobOrReconciliationId,
4141
) -> Result<(), CrucibleError> {
42-
self.0.lock().unwrap().flush(new_flush, new_gen, job_id)
42+
self.0.lock().unwrap().pre_flush(new_flush, new_gen, job_id)
43+
}
44+
45+
fn flush_inner(
46+
&mut self,
47+
job_id: JobOrReconciliationId,
48+
) -> Result<(), CrucibleError> {
49+
self.0.lock().unwrap().flush_inner(job_id)
50+
}
51+
52+
fn post_flush(
53+
&mut self,
54+
new_flush: u64,
55+
new_gen: u64,
56+
job_id: JobOrReconciliationId,
57+
) -> Result<(), CrucibleError> {
58+
self.0
59+
.lock()
60+
.unwrap()
61+
.post_flush(new_flush, new_gen, job_id)
4362
}
4463

4564
fn read(
@@ -191,10 +210,10 @@ impl SqliteMoreInner {
191210
Ok(self.dirty.get())
192211
}
193212

194-
fn flush(
213+
fn pre_flush(
195214
&mut self,
196-
new_flush: u64,
197-
new_gen: u64,
215+
_new_flush: u64,
216+
_new_gen: u64,
198217
job_id: JobOrReconciliationId,
199218
) -> Result<(), CrucibleError> {
200219
// Used for profiling
@@ -204,13 +223,18 @@ impl SqliteMoreInner {
204223
(job_id.get(), self.extent_number, n_dirty_blocks)
205224
});
206225

226+
Ok(())
227+
}
228+
229+
fn flush_inner(
230+
&mut self,
231+
job_id: JobOrReconciliationId,
232+
) -> Result<(), CrucibleError> {
207233
/*
208234
* We must first fsync to get any outstanding data written to disk.
209235
* This must be done before we update the flush number.
210236
*/
211-
cdt::extent__flush__file__start!(|| {
212-
(job_id.get(), self.extent_number, n_dirty_blocks)
213-
});
237+
cdt::extent__flush__file__start!(|| (job_id.get(), self.extent_number));
214238
if let Err(e) = self.file.sync_all() {
215239
/*
216240
* XXX Retry? Mark extent as broken?
@@ -221,10 +245,17 @@ impl SqliteMoreInner {
221245
self.extent_number,
222246
);
223247
}
224-
cdt::extent__flush__file__done!(|| {
225-
(job_id.get(), self.extent_number, n_dirty_blocks)
226-
});
248+
cdt::extent__flush__file__done!(|| (job_id.get(), self.extent_number));
227249

250+
Ok(())
251+
}
252+
253+
fn post_flush(
254+
&mut self,
255+
new_flush: u64,
256+
new_gen: u64,
257+
job_id: JobOrReconciliationId,
258+
) -> Result<(), CrucibleError> {
228259
// Clear old block contexts. In order to be crash consistent, only
229260
// perform this after the extent fsync is done. For each block
230261
// written since the last flush, remove all block context rows where
@@ -233,9 +264,10 @@ impl SqliteMoreInner {
233264
// values were written. When the region is first opened, the entire
234265
// file is rehashed, since in that case we don't have that luxury.
235266

236-
cdt::extent__flush__collect__hashes__start!(|| {
237-
(job_id.get(), self.extent_number, n_dirty_blocks)
238-
});
267+
cdt::extent__flush__collect__hashes__start!(|| (
268+
job_id.get(),
269+
self.extent_number
270+
));
239271

240272
// Rehash any parts of the file that we *may have written* data to since
241273
// the last flush. (If we know that we wrote the data, then we don't
@@ -246,9 +278,10 @@ impl SqliteMoreInner {
246278
(job_id.get(), self.extent_number, n_rehashed as u64)
247279
});
248280

249-
cdt::extent__flush__sqlite__insert__start!(|| {
250-
(job_id.get(), self.extent_number, n_dirty_blocks)
251-
});
281+
cdt::extent__flush__sqlite__insert__start!(|| (
282+
job_id.get(),
283+
self.extent_number
284+
));
252285

253286
// We put all of our metadb updates into a single transaction to
254287
// assure that we have a single sync.
@@ -261,9 +294,10 @@ impl SqliteMoreInner {
261294
&tx,
262295
)?;
263296

264-
cdt::extent__flush__sqlite__insert__done!(|| {
265-
(job_id.get(), self.extent_number, n_dirty_blocks)
266-
});
297+
cdt::extent__flush__sqlite__insert__done!(|| (
298+
job_id.get(),
299+
self.extent_number
300+
));
267301

268302
self.set_flush_number(new_flush, new_gen)?;
269303
tx.commit()?;
@@ -272,9 +306,7 @@ impl SqliteMoreInner {
272306
// Finally, reset the file's seek offset to 0
273307
self.file.seek(SeekFrom::Start(0))?;
274308

275-
cdt::extent__flush__done!(|| {
276-
(job_id.get(), self.extent_number, n_dirty_blocks)
277-
});
309+
cdt::extent__flush__done!(|| (job_id.get(), self.extent_number));
278310
Ok(())
279311
}
280312

0 commit comments

Comments
 (0)