Avoid writes that span ZFS records

mkeeter · mkeeter · commit a870b1e634a1 · 2024-04-22T17:53:42.000-04:00
diff --git a/downstairs/src/extent_inner_raw_v2.rs b/downstairs/src/extent_inner_raw_v2.rs
@@ -42,6 +42,7 @@ struct PackedBlockContext {
 
 const ENCRYPTED_BLOCK_CONTEXT: u32 = 1;
 const UNENCRYPTED_BLOCK_CONTEXT: u32 = 2;
+const ZFS_RECORDSIZE: u64 = 128 * 1024;
 
 const BLOCK_CONTEXT_SIZE_BYTES: u64 =
     std::mem::size_of::<PackedBlockContext>() as u64;
@@ -511,17 +512,21 @@ impl RawInnerV2 {
 
         let (start_pos, mut parity) =
             self.layout.block_pos(writes[0].offset.value);
+        let start_block = writes[0].offset;
+        let mut block = start_block.value;
 
         // TODO we're overestimating capacity here, because we can write
         // contexts in pairs.
         let mut iovecs = Vec::with_capacity(n_blocks * 2);
 
         let mut ctx_slice = ctxs.as_slice();
         let mut write_slice = writes;
+        let padding = vec![0u8; self.layout.padding_size() as usize];
         while !write_slice.is_empty() || !ctx_slice.is_empty() {
             match parity {
                 Parity::FirstBlock => {
                     iovecs.push(IoSlice::new(&write_slice[0].data));
+                    block += 1;
                     write_slice = &write_slice[1..];
                     parity = Parity::FirstContext;
                 }
@@ -547,13 +552,15 @@ impl RawInnerV2 {
                     // contexts).
                     iovecs.push(IoSlice::new(&write_slice[0].data));
                     write_slice = &write_slice[1..];
+                    if self.layout.has_padding_after(block) {
+                        iovecs.push(IoSlice::new(&padding));
+                    }
                     parity = Parity::FirstBlock;
+                    block += 1;
                 }
             }
         }
 
-        let start_block = writes[0].offset;
-
         let expected_bytes =
             n_blocks as u64 * (block_size as u64 + BLOCK_CONTEXT_SIZE_BYTES);
 
@@ -637,52 +644,88 @@ impl RawInnerV2 {
         let mut buf_slice = &mut buf[..];
         let mut ctx_slice = &mut ctxs[..];
 
+        // This is awkward: we know how many blocks and contexts we're reading,
+        // and have pre-allocated data for them.  However, we don't know how
+        // many chunks of padding we may need to read!  As such, we'll store a
+        // `Vec<Option<IoSliceMut>>`, and use `None` to represent padding reads;
+        // then, we'll go through and splice them in once we know their total
+        // size.
+        let mut block = start_block.value;
+        let mut padding_count = 0;
         while !ctx_slice.is_empty() || !buf_slice.is_empty() {
             match parity {
                 Parity::FirstBlock => {
                     let (b, next) = buf_slice.split_at_mut(block_size);
-                    iovecs.push(IoSliceMut::new(b));
+                    iovecs.push(Some(IoSliceMut::new(b)));
                     buf_slice = next;
                     parity = Parity::FirstContext;
+                    block += 1;
                 }
                 Parity::FirstContext => {
                     if ctx_slice.len() > 1 {
                         let (b, next) = ctx_slice.split_at_mut(2);
-                        iovecs.push(IoSliceMut::new(b.as_bytes_mut()));
+                        iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut())));
                         ctx_slice = next;
                         parity = Parity::SecondBlock;
                     } else {
                         let (b, next) = ctx_slice.split_at_mut(1);
-                        iovecs.push(IoSliceMut::new(b.as_bytes_mut()));
+                        iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut())));
                         ctx_slice = next;
                         parity = Parity::SecondContext;
                     }
                 }
                 Parity::SecondContext => {
                     let (b, next) = ctx_slice.split_at_mut(1);
-                    iovecs.push(IoSliceMut::new(b.as_bytes_mut()));
+                    iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut())));
                     ctx_slice = next;
                     parity = Parity::SecondBlock;
                 }
                 Parity::SecondBlock => {
-                    if buf_slice.len() > block_size {
+                    let has_padding = self.layout.has_padding_after(block);
+                    if buf_slice.len() > block_size && !has_padding {
                         let (b, next) = buf_slice.split_at_mut(block_size * 2);
-                        iovecs.push(IoSliceMut::new(b));
+                        iovecs.push(Some(IoSliceMut::new(b)));
                         buf_slice = next;
                         parity = Parity::FirstContext;
+                        block += 2;
                     } else {
                         let (b, next) = buf_slice.split_at_mut(block_size);
-                        iovecs.push(IoSliceMut::new(b));
+                        iovecs.push(Some(IoSliceMut::new(b)));
                         buf_slice = next;
+                        if has_padding {
+                            iovecs.push(None);
+                            padding_count += 1;
+                        }
                         parity = Parity::FirstBlock;
+                        block += 1;
                     }
                 }
             }
         }
 
-        let expected_bytes =
+        // How many bytes do we expect `preadv` to return?
+        let mut expected_bytes =
             n_blocks as u64 * (block_size as u64 + BLOCK_CONTEXT_SIZE_BYTES);
 
+        // Now that we know the number of padding reads, replace each `None`
+        // with a borrowed chunk of a scratch buffer (`padding`), then unwrap
+        // every entry into a plain `IoSliceMut`.
+        let mut padding = vec![];
+        if padding_count > 0 {
+            let padding_size = self.layout.padding_size() as usize;
+            padding.resize(padding_size * padding_count, 0u8);
+            expected_bytes += padding.len() as u64;
+            for (iov, p) in iovecs
+                .iter_mut()
+                .filter(|b| b.is_none())
+                .zip(padding.chunks_mut(padding_size))
+            {
+                *iov = Some(IoSliceMut::new(p));
+            }
+        }
+        let mut iovecs: Vec<_> =
+            iovecs.into_iter().map(Option::unwrap).collect();
+
         // Finally we get to read the actual data. That's why we're here
         cdt::extent__read__file__start!(|| {
             (job_id.0, self.extent_number, n_blocks as u64)
@@ -778,7 +821,19 @@ impl RawLayout {
 
     /// Returns the byte offset of the `block_written` bitpacked array
     fn block_written_array_offset(&self) -> u64 {
-        self.block_count() * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES)
+        let bpr = self.blocks_per_record();
+        let bc = self.block_count();
+
+        if bc % bpr == 0 {
+            (bc / bpr) * ZFS_RECORDSIZE
+        } else {
+            let record_count = bc / bpr;
+            let trailing_blocks = bc - record_count * bpr;
+
+            record_count * ZFS_RECORDSIZE
+                + trailing_blocks
+                    * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES)
+        }
     }
 
     /// Returns the size of the `block_written` bitpacked array, in bytes
@@ -885,7 +940,11 @@ impl RawLayout {
     ///
     /// This offset could either be block data or context, depending on parity!
     fn block_pos(&self, block: u64) -> (u64, Parity) {
-        let pos = block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
+        let bpr = self.blocks_per_record();
+        let record = block / bpr;
+        let block = block % bpr;
+        let pos = record * ZFS_RECORDSIZE
+            + block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
         let parity = match block % 2 {
             0 => Parity::FirstBlock,
             1 => Parity::SecondContext,
@@ -896,13 +955,43 @@ impl RawLayout {
 
     /// Returns the position of the given block's context
     fn context_slot(&self, block: u64) -> u64 {
-        let pos = block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
+        let bpr = self.blocks_per_record();
+        let record = block / bpr;
+        let block = block % bpr;
+        let pos = record * ZFS_RECORDSIZE
+            + block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
         match block % 2 {
             0 => pos + self.block_size(),
             1 => pos,
             _ => unreachable!(),
         }
     }
+
+    /// Returns the number of blocks stored per ZFS record (always even)
+    fn blocks_per_record(&self) -> u64 {
+        // Each block contains data and a single context slot
+        let bytes_per_block = self.block_size() + BLOCK_CONTEXT_SIZE_BYTES;
+        // We guarantee that there are an even number of blocks per record, for
+        // simplicity (so that padding always comes after `Parity::SecondBlock`)
+        2 * (ZFS_RECORDSIZE / (2 * bytes_per_block))
+    }
+
+    /// Checks whether there is padding after the given block
+    fn has_padding_after(&self, block: u64) -> bool {
+        // Never report padding after the final block
+        if block == self.block_count() - 1 {
+            return false;
+        }
+        // Otherwise, padding follows the last block of each ZFS record
+        let bpr = self.blocks_per_record();
+        (block % bpr) == bpr - 1
+    }
+
+    /// Returns the padding size (in bytes) at the end of each ZFS record
+    fn padding_size(&self) -> u64 {
+        let bpr = self.blocks_per_record();
+        ZFS_RECORDSIZE - bpr * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES)
+    }
 }
 
 /// Represents position in a block-context pair.