Commit 97e2319

Use uninitialized memory when doing reads (#1287)
While fixing #1286, I noticed that 8% of on-CPU time was being spent doing `memset` in `read_into` (see the PR for a flamegraph). This is unnecessary work: the buffer has sufficient capacity, and we're about to overwrite that region of the buffer. In this PR, we switch to calling `libc::pread` directly into the uninitialized portion of the buffer, then manually set the length with `set_len`. Sure enough, this removes the `memset` from the flamegraph.

Note that the new code is identical between `extent_inner_sqlite.rs` and `extent_inner_raw.rs`, because they have the same data format; since we're planning to remove SQLite in #1267, I don't feel the need to refactor into helper functions.

There's also a drive-by fix revealed by the new assertion: we need to reattach the remaining `buf` to `out.data` (by calling `unsplit(..)`); otherwise, it will force a reallocation. This will only trigger for reads which span multiple extents, so it's probably not a big deal in the wild, but let's do the correct thing instead of the wrong thing.
1 parent 621f250 commit 97e2319
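
As a rough, standalone sketch of the pattern the diffs below adopt (not the Crucible code itself): reserve capacity up front, point `libc::pread` at the spare, uninitialized capacity, check the return value, and only then extend the length with `set_len`. The sketch assumes the `libc` crate, uses a plain `Vec<u8>` where the real code uses `bytes::BytesMut` (both expose `spare_capacity_mut` and `set_len`), returns `std::io::Error` instead of `CrucibleError`, and the helper name `pread_into_spare` is made up for illustration.

```rust
use std::fs::File;
use std::os::fd::AsRawFd;

/// Read exactly `n` bytes at byte `offset` into the uninitialized tail of
/// `buf`, then extend `buf`'s length to cover the newly written bytes.
fn pread_into_spare(
    file: &File,
    buf: &mut Vec<u8>,
    n: usize,
    offset: i64,
) -> std::io::Result<()> {
    // The caller reserved the space up front, so reading into the spare
    // capacity never reallocates and, unlike `resize`, never memsets.
    assert!(buf.spare_capacity_mut().len() >= n);

    // SAFETY: the spare-capacity pointer is valid for `n` bytes of writes,
    // and the file descriptor stays open for the duration of the call.
    let r = unsafe {
        libc::pread(
            file.as_raw_fd(),
            buf.spare_capacity_mut().as_mut_ptr() as *mut libc::c_void,
            n as libc::size_t,
            offset as libc::off_t,
        )
    };
    if r < 0 {
        return Err(std::io::Error::last_os_error());
    }
    let num_bytes = r as usize;
    if num_bytes != n {
        return Err(std::io::Error::new(
            std::io::ErrorKind::UnexpectedEof,
            format!("incomplete read (expected {n}, got {num_bytes})"),
        ));
    }
    // SAFETY: `pread` just initialized `n` bytes starting at the old end of
    // the buffer, so the new length is fully initialized.
    unsafe { buf.set_len(buf.len() + n) };
    Ok(())
}
```

The key difference from the old `buf.resize(expected_bytes, 1)` path is that nothing writes to the region before `pread` does, so the `memset` disappears.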

File tree: 2 files changed, +62 -25 lines

  downstairs/src/extent_inner_raw.rs
  downstairs/src/extent_inner_sqlite.rs

downstairs/src/extent_inner_raw.rs (+25, -14)

@@ -21,7 +21,7 @@ use slog::{error, Logger};
 use std::collections::HashSet;
 use std::fs::{File, OpenOptions};
 use std::io::{BufReader, Read};
-use std::os::fd::AsFd;
+use std::os::fd::{AsFd, AsRawFd};
 use std::path::Path;
 
 /// Equivalent to `DownstairsBlockContext`, but without one's own block number
@@ -219,11 +219,11 @@ impl ExtentInner for RawInner {
                 out.blocks.push(resp);
             }
 
-            // Calculate the number of expected bytes, then resize our buffer
-            //
-            // This should fill memory, but should not reallocate
+            // To avoid a `memset`, we're reading directly into uninitialized
+            // memory in the buffer. This is fine; we sized the buffer
+            // appropriately in advance (and will panic here if we messed up).
             let expected_bytes = n_contiguous_blocks * block_size as usize;
-            buf.resize(expected_bytes, 1);
+            assert!(buf.spare_capacity_mut().len() >= expected_bytes);
 
             let first_resp = &out.blocks[resp_run_start];
             check_input(self.extent_size, first_resp.offset, &buf)?;
@@ -233,15 +233,21 @@ impl ExtentInner for RawInner {
                 (job_id.0, self.extent_number, n_contiguous_blocks as u64)
             });
 
-            // Perform the bulk read, then check against the expected number of
-            // bytes. We could do more robust error handling here (e.g.
-            // retrying in a loop), but for now, simply bailing out seems wise.
-            let num_bytes = nix::sys::uio::pread(
-                self.file.as_fd(),
-                &mut buf,
-                first_req.offset.value as i64 * block_size as i64,
-            )
-            .map_err(|e| {
+            // SAFETY: the buffer has sufficient capacity, and this is a valid
+            // file descriptor.
+            let r = unsafe {
+                libc::pread(
+                    self.file.as_raw_fd(),
+                    buf.spare_capacity_mut().as_mut_ptr() as *mut libc::c_void,
+                    expected_bytes as libc::size_t,
+                    first_req.offset.value as i64 * block_size as i64,
+                )
+            };
+            // Check against the expected number of bytes. We could do more
+            // robust error handling here (e.g. retrying in a loop), but for
+            // now, simply bailing out seems wise.
+            let r = nix::errno::Errno::result(r).map(|r| r as usize);
+            let num_bytes = r.map_err(|e| {
                 CrucibleError::IoError(format!(
                     "extent {}: read failed: {e}",
                     self.extent_number
@@ -254,6 +260,10 @@ impl ExtentInner for RawInner {
                     self.extent_number
                 )));
             }
+            // SAFETY: we just initialized this chunk of the buffer
+            unsafe {
+                buf.set_len(expected_bytes);
+            }
 
             cdt::extent__read__file__done!(|| {
                 (job_id.0, self.extent_number, n_contiguous_blocks as u64)
@@ -295,6 +305,7 @@ impl ExtentInner for RawInner {
 
             req_run_start += n_contiguous_blocks;
         }
+        out.data.unsplit(buf);
         Ok(())
     }
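
The `out.data.unsplit(buf)` line at the end of the loop relies on `bytes::BytesMut`'s zero-copy split/merge: when the piece being absorbed is still contiguous with its parent, `unsplit` is an O(1) pointer adjustment and the shared allocation's capacity is reclaimed rather than stranded. A small illustrative sketch of that behavior; the sizes and fill byte are arbitrary and not taken from the Crucible code:

```rust
use bytes::BytesMut;

fn main() {
    // One allocation, sized up front for everything this read will produce.
    let mut data = BytesMut::with_capacity(4096);
    data.resize(1024, 0); // bytes already produced for earlier extents

    // Hand the unused tail to the per-extent read path.
    let mut buf = data.split_off(data.len());
    buf.extend_from_slice(&[0xaa; 512]); // stand-in for one extent's read

    // Reattach the tail. `buf` is still contiguous with `data`, so this is
    // an O(1) merge and the original allocation is kept; dropping `buf`
    // instead would strand its capacity, forcing a fresh allocation the
    // next time more data is appended.
    data.unsplit(buf);
    assert_eq!(data.len(), 1024 + 512);
}
```

Skipping the `unsplit` means a later append to `out.data` has to reallocate, which is the drive-by fix the commit message describes for reads spanning multiple extents.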

downstairs/src/extent_inner_sqlite.rs (+37, -11)

@@ -17,7 +17,7 @@ use slog::{error, Logger};
 use std::collections::{BTreeMap, HashSet};
 use std::fs::{File, OpenOptions};
 use std::io::{BufReader, Read, Seek, SeekFrom};
-use std::os::fd::AsFd;
+use std::os::fd::{AsFd, AsRawFd};
 use std::path::Path;
 
 #[derive(Debug)]
@@ -328,11 +328,11 @@ impl SqliteMoreInner {
                 out.blocks.push(resp);
             }
 
-            // Calculate the number of expected bytes, then resize our buffer
-            //
-            // This should fill memory, but should not reallocate
+            // To avoid a `memset`, we're reading directly into uninitialized
+            // memory in the buffer. This is fine; we sized the buffer
+            // appropriately in advance (and will panic here if we messed up).
             let expected_bytes = n_contiguous_blocks * block_size as usize;
-            buf.resize(expected_bytes, 1);
+            assert!(buf.spare_capacity_mut().len() >= expected_bytes);
 
             let first_resp = &out.blocks[resp_run_start];
             check_input(self.extent_size, first_resp.offset, &buf)?;
@@ -342,12 +342,37 @@ impl SqliteMoreInner {
                 (job_id.0, self.extent_number, n_contiguous_blocks as u64)
             });
 
-            nix::sys::uio::pread(
-                self.file.as_fd(),
-                &mut buf,
-                first_req.offset.value as i64 * block_size as i64,
-            )
-            .map_err(|e| CrucibleError::IoError(e.to_string()))?;
+            // SAFETY: the buffer has sufficient capacity, and this is a valid
+            // file descriptor.
+            let r = unsafe {
+                libc::pread(
+                    self.file.as_raw_fd(),
+                    buf.spare_capacity_mut().as_mut_ptr() as *mut libc::c_void,
+                    expected_bytes as libc::size_t,
+                    first_req.offset.value as i64 * block_size as i64,
+                )
+            };
+            // Check against the expected number of bytes. We could do more
+            // robust error handling here (e.g. retrying in a loop), but for
+            // now, simply bailing out seems wise.
+            let r = nix::errno::Errno::result(r).map(|r| r as usize);
+            let num_bytes = r.map_err(|e| {
+                CrucibleError::IoError(format!(
+                    "extent {}: read failed: {e}",
+                    self.extent_number
+                ))
+            })?;
+            if num_bytes != expected_bytes {
+                return Err(CrucibleError::IoError(format!(
+                    "extent {}: incomplete read \
+                     (expected {expected_bytes}, got {num_bytes})",
+                    self.extent_number
+                )));
+            }
+            // SAFETY: we just initialized this chunk of the buffer
+            unsafe {
+                buf.set_len(expected_bytes);
+            }
 
             cdt::extent__read__file__done!(|| {
                 (job_id.0, self.extent_number, n_contiguous_blocks as u64)
@@ -388,6 +413,7 @@ impl SqliteMoreInner {
 
             req_run_start += n_contiguous_blocks;
         }
+        out.data.unsplit(buf);
         Ok(())
     }
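
Both files funnel the raw `libc::pread` return value through `nix::errno::Errno::result`, which turns a `-1` into the current `errno` and passes any other value through; the byte count is then checked against `expected_bytes` and errors are wrapped in `CrucibleError::IoError`. A minimal sketch of just that conversion step, with a made-up wrapper name (`pread_result`):

```rust
use nix::errno::Errno;

/// Convert the raw return value of `libc::pread` into a `Result`: -1 maps
/// to the current `errno`, any other value passes through as a byte count.
fn pread_result(ret: libc::ssize_t) -> Result<usize, Errno> {
    Errno::result(ret).map(|n| n as usize)
}
```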
