Skip to content

Commit a870b1e

Browse files
committed
Avoid writes that span ZFS records
1 parent 449393a commit a870b1e

File tree

1 file changed

+102
-13
lines changed

1 file changed

+102
-13
lines changed

downstairs/src/extent_inner_raw_v2.rs

+102-13
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ struct PackedBlockContext {
4242

4343
const ENCRYPTED_BLOCK_CONTEXT: u32 = 1;
4444
const UNENCRYPTED_BLOCK_CONTEXT: u32 = 2;
45+
/// Size of a ZFS record in bytes (128 KiB).
///
/// Blocks and their context slots are grouped into records, and the byte
/// position of a block is computed as `record * ZFS_RECORDSIZE` plus the
/// block's position within its record.
// NOTE(review): presumably this has to match the `recordsize` property of the
// dataset backing the file -- confirm against the deployment configuration.
const ZFS_RECORDSIZE: u64 = 128 * 1024;
4546

4647
const BLOCK_CONTEXT_SIZE_BYTES: u64 =
4748
std::mem::size_of::<PackedBlockContext>() as u64;
@@ -511,17 +512,21 @@ impl RawInnerV2 {
511512

512513
let (start_pos, mut parity) =
513514
self.layout.block_pos(writes[0].offset.value);
515+
let start_block = writes[0].offset;
516+
let mut block = start_block.value;
514517

515518
// TODO we're overestimating capacity here, because we can write
516519
// contexts in pairs.
517520
let mut iovecs = Vec::with_capacity(n_blocks * 2);
518521

519522
let mut ctx_slice = ctxs.as_slice();
520523
let mut write_slice = writes;
524+
let padding = vec![0u8; self.layout.padding_size() as usize];
521525
while !write_slice.is_empty() || !ctx_slice.is_empty() {
522526
match parity {
523527
Parity::FirstBlock => {
524528
iovecs.push(IoSlice::new(&write_slice[0].data));
529+
block += 1;
525530
write_slice = &write_slice[1..];
526531
parity = Parity::FirstContext;
527532
}
@@ -547,13 +552,15 @@ impl RawInnerV2 {
547552
// contexts).
548553
iovecs.push(IoSlice::new(&write_slice[0].data));
549554
write_slice = &write_slice[1..];
555+
if self.layout.has_padding_after(block) {
556+
iovecs.push(IoSlice::new(&padding));
557+
}
550558
parity = Parity::FirstBlock;
559+
block += 1;
551560
}
552561
}
553562
}
554563

555-
let start_block = writes[0].offset;
556-
557564
let expected_bytes =
558565
n_blocks as u64 * (block_size as u64 + BLOCK_CONTEXT_SIZE_BYTES);
559566

@@ -637,52 +644,88 @@ impl RawInnerV2 {
637644
let mut buf_slice = &mut buf[..];
638645
let mut ctx_slice = &mut ctxs[..];
639646

647+
// This is awkward: we know how many blocks and contexts we're reading,
648+
// and have pre-allocated data for them. However, we don't know how
649+
// many chunks of padding we may need to read! As such, we'll store a
650+
// `Vec<Option<IoSliceMut>>`, and use `None` to represent padding reads;
651+
// then, we'll go through and splice them in once we know their total
652+
// size.
653+
let mut block = start_block.value;
654+
let mut padding_count = 0;
640655
while !ctx_slice.is_empty() || !buf_slice.is_empty() {
641656
match parity {
642657
Parity::FirstBlock => {
643658
let (b, next) = buf_slice.split_at_mut(block_size);
644-
iovecs.push(IoSliceMut::new(b));
659+
iovecs.push(Some(IoSliceMut::new(b)));
645660
buf_slice = next;
646661
parity = Parity::FirstContext;
662+
block += 1;
647663
}
648664
Parity::FirstContext => {
649665
if ctx_slice.len() > 1 {
650666
let (b, next) = ctx_slice.split_at_mut(2);
651-
iovecs.push(IoSliceMut::new(b.as_bytes_mut()));
667+
iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut())));
652668
ctx_slice = next;
653669
parity = Parity::SecondBlock;
654670
} else {
655671
let (b, next) = ctx_slice.split_at_mut(1);
656-
iovecs.push(IoSliceMut::new(b.as_bytes_mut()));
672+
iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut())));
657673
ctx_slice = next;
658674
parity = Parity::SecondContext;
659675
}
660676
}
661677
Parity::SecondContext => {
662678
let (b, next) = ctx_slice.split_at_mut(1);
663-
iovecs.push(IoSliceMut::new(b.as_bytes_mut()));
679+
iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut())));
664680
ctx_slice = next;
665681
parity = Parity::SecondBlock;
666682
}
667683
Parity::SecondBlock => {
668-
if buf_slice.len() > block_size {
684+
let has_padding = self.layout.has_padding_after(block);
685+
if buf_slice.len() > block_size && !has_padding {
669686
let (b, next) = buf_slice.split_at_mut(block_size * 2);
670-
iovecs.push(IoSliceMut::new(b));
687+
iovecs.push(Some(IoSliceMut::new(b)));
671688
buf_slice = next;
672689
parity = Parity::FirstContext;
690+
block += 2;
673691
} else {
674692
let (b, next) = buf_slice.split_at_mut(block_size);
675-
iovecs.push(IoSliceMut::new(b));
693+
iovecs.push(Some(IoSliceMut::new(b)));
676694
buf_slice = next;
695+
if has_padding {
696+
iovecs.push(None);
697+
padding_count += 1;
698+
}
677699
parity = Parity::FirstBlock;
700+
block += 1;
678701
}
679702
}
680703
}
681704
}
682705

683-
let expected_bytes =
706+
// How many bytes do we expect `preadv` to return?
707+
let mut expected_bytes =
684708
n_blocks as u64 * (block_size as u64 + BLOCK_CONTEXT_SIZE_BYTES);
685709

710+
// Now that we know the total number of padded reads, replace the `None`
711+
// with borrowed chunks of a dummy array (`padding`) and unwrap all of
712+
// the IoVecs.
713+
let mut padding = vec![];
714+
if padding_count > 0 {
715+
let padding_size = self.layout.padding_size() as usize;
716+
padding.resize(padding_size * padding_count, 0u8);
717+
expected_bytes += padding.len() as u64;
718+
for (iov, p) in iovecs
719+
.iter_mut()
720+
.filter(|b| b.is_none())
721+
.zip(padding.chunks_mut(padding_size))
722+
{
723+
*iov = Some(IoSliceMut::new(p));
724+
}
725+
}
726+
let mut iovecs: Vec<_> =
727+
iovecs.into_iter().map(Option::unwrap).collect();
728+
686729
// Finally we get to read the actual data. That's why we're here
687730
cdt::extent__read__file__start!(|| {
688731
(job_id.0, self.extent_number, n_blocks as u64)
@@ -778,7 +821,19 @@ impl RawLayout {
778821

779822
/// Returns the byte offset of the `block_written` bitpacked array
780823
fn block_written_array_offset(&self) -> u64 {
781-
self.block_count() * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES)
824+
let bpr = self.blocks_per_record();
825+
let bc = self.block_count();
826+
827+
if bc % bpr == 0 {
828+
(bc / bpr) * ZFS_RECORDSIZE
829+
} else {
830+
let record_count = bc / bpr;
831+
let trailing_blocks = bc - record_count * bpr;
832+
833+
record_count * ZFS_RECORDSIZE
834+
+ trailing_blocks
835+
* (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES)
836+
}
782837
}
783838

784839
/// Returns the size of the `block_written` bitpacked array, in bytes
@@ -885,7 +940,11 @@ impl RawLayout {
885940
///
886941
/// This offset could either be block data or context, depending on parity!
887942
fn block_pos(&self, block: u64) -> (u64, Parity) {
888-
let pos = block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
943+
let bpr = self.blocks_per_record();
944+
let record = block / bpr;
945+
let block = block % bpr;
946+
let pos = record * ZFS_RECORDSIZE
947+
+ block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
889948
let parity = match block % 2 {
890949
0 => Parity::FirstBlock,
891950
1 => Parity::SecondContext,
@@ -896,13 +955,43 @@ impl RawLayout {
896955

897956
/// Returns the position of the given block's context
898957
fn context_slot(&self, block: u64) -> u64 {
899-
let pos = block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
958+
let bpr = self.blocks_per_record();
959+
let record = block / bpr;
960+
let block = block % bpr;
961+
let pos = record * ZFS_RECORDSIZE
962+
+ block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
900963
match block % 2 {
901964
0 => pos + self.block_size(),
902965
1 => pos,
903966
_ => unreachable!(),
904967
}
905968
}
969+
970+
/// Returns the number of blocks that fit into a ZFS recordsize
971+
fn blocks_per_record(&self) -> u64 {
972+
// Each block contains data and a single context slot
973+
let bytes_per_block = self.block_size() + BLOCK_CONTEXT_SIZE_BYTES;
974+
// We guarantee that there are an even number of blocks per record, for
975+
// simplicity (so that padding always comes after `Parity::SecondBlock`)
976+
2 * (ZFS_RECORDSIZE / (2 * bytes_per_block))
977+
}
978+
979+
/// Checks whether there is padding after the given block
980+
fn has_padding_after(&self, block: u64) -> bool {
981+
// No padding at the end of the file
982+
if block == self.block_count() - 1 {
983+
return false;
984+
}
985+
// Otherwise, there's padding at the end of each block-pair-group
986+
let bpr = self.blocks_per_record();
987+
(block % bpr) == bpr - 1
988+
}
989+
990+
/// Returns the size of `recordsize` padding
991+
fn padding_size(&self) -> u64 {
992+
let bpr = self.blocks_per_record();
993+
ZFS_RECORDSIZE - bpr * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES)
994+
}
906995
}
907996

908997
/// Represents position in a block-context pair.

0 commit comments

Comments
 (0)