diff --git a/bin/propolis-server/src/lib/stats/virtual_disk.rs b/bin/propolis-server/src/lib/stats/virtual_disk.rs index a9e4114d4..d37a28f25 100644 --- a/bin/propolis-server/src/lib/stats/virtual_disk.rs +++ b/bin/propolis-server/src/lib/stats/virtual_disk.rs @@ -81,6 +81,10 @@ impl VirtualDiskStats { self.on_write_completion(result, len, duration) } Operation::Flush => self.on_flush_completion(result, duration), + Operation::Discard(..) => { + // Discard is not wired up in backends we care about for now, so + // it can safely be ignored. + } } } diff --git a/lib/propolis/src/block/crucible.rs b/lib/propolis/src/block/crucible.rs index 0061da493..2b41adbb0 100644 --- a/lib/propolis/src/block/crucible.rs +++ b/lib/propolis/src/block/crucible.rs @@ -149,6 +149,7 @@ impl CrucibleBackend { block_size: block_size as u32, total_size: sectors, read_only: opts.read_only.unwrap_or(false), + supports_discard: false, }, skip_flush: opts.skip_flush.unwrap_or(false), }), @@ -198,6 +199,7 @@ impl CrucibleBackend { block_size: block_size as u32, total_size: size / block_size, read_only: opts.read_only.unwrap_or(false), + supports_discard: false, }, skip_flush: opts.skip_flush.unwrap_or(false), }), @@ -312,6 +314,8 @@ pub enum Error { BadGuestRegion, #[error("backend is read-only")] ReadOnly, + #[error("operation not supported")] + Unsupported, #[error("offset or length not multiple of blocksize")] BlocksizeMismatch, @@ -329,6 +333,7 @@ impl From for block::Result { fn from(value: Error) -> Self { match value { Error::ReadOnly => block::Result::ReadOnly, + Error::Unsupported => block::Result::Unsupported, _ => block::Result::Failure, } } @@ -421,6 +426,10 @@ async fn process_request( let _ = block.flush(None).await?; } } + block::Operation::Discard(..) => { + // Crucible does not support discard operations for now + return Err(Error::Unsupported); + } } Ok(()) } diff --git a/lib/propolis/src/block/file.rs b/lib/propolis/src/block/file.rs index aa8b3e99d..feb034ea5 100644 --- a/lib/propolis/src/block/file.rs +++ b/lib/propolis/src/block/file.rs @@ -5,8 +5,6 @@ use std::fs::{metadata, File, OpenOptions}; use std::io::{Error, ErrorKind, Result}; use std::num::NonZeroUsize; -use std::os::raw::c_int; -use std::os::unix::fs::FileTypeExt; use std::os::unix::io::AsRawFd; use std::path::Path; use std::sync::{Arc, Mutex}; @@ -14,7 +12,6 @@ use std::sync::{Arc, Mutex}; use crate::accessors::MemAccessor; use crate::block::{self, DeviceInfo}; use crate::tasks::ThreadGroup; -use crate::util::ioctl; use crate::vmm::{MappingExt, MemCtx}; use anyhow::Context; @@ -22,10 +19,6 @@ use anyhow::Context; // XXX: completely arb for now const MAX_WORKERS: usize = 32; -const DKIOC: i32 = 0x04 << 8; -const DKIOCGETWCE: i32 = DKIOC | 36; -const DKIOCSETWCE: i32 = DKIOC | 37; - pub struct FileBackend { state: Arc, @@ -38,6 +31,7 @@ struct WorkerState { /// Write-Cache-Enable state (if supported) of the underlying device wce_state: Mutex>, + discard_mech: Option, info: block::DeviceInfo, skip_flush: bool, @@ -47,17 +41,18 @@ struct WceState { current: bool, } impl WorkerState { - fn new(fp: File, info: block::DeviceInfo, skip_flush: bool) -> Arc { - let wce_state = match info.read_only { - true => None, - false => get_wce(&fp) - .map(|initial| WceState { initial, current: initial }), - }; - + fn new( + fp: File, + info: block::DeviceInfo, + skip_flush: bool, + wce_state: Option, + discard_mech: Option, + ) -> Arc { let state = WorkerState { attachment: block::BackendAttachment::new(), fp, wce_state: Mutex::new(wce_state), + discard_mech, skip_flush, info, }; @@ -74,6 +69,10 @@ impl WorkerState { req.complete(block::Result::ReadOnly); continue; } + if self.discard_mech.is_none() && req.oper().is_discard() { + req.complete(block::Result::Unsupported); + continue; + } let mem = match acc_mem.access() { Some(m) => m, @@ -121,14 +120,28 @@ impl WorkerState { self.fp.sync_data().map_err(|_| "io error")?; } } + block::Operation::Discard(off, len) => { + if let Some(mech) = self.discard_mech { + dkioc::do_discard(&self.fp, mech, off as u64, len as u64) + .map_err(|_| { + "io error while attempting to free block(s)" + })?; + } else { + unreachable!("handled above in processing_loop()"); + } + } } Ok(()) } fn set_wce(&self, enabled: bool) { + if self.info.read_only { + // Do not needlessly toggle the cache on a read-only disk + return; + } if let Some(state) = self.wce_state.lock().unwrap().as_mut() { if state.current != enabled { - if let Some(new_wce) = set_wce(&self.fp, enabled).ok() { + if let Some(new_wce) = dkioc::set_wce(&self.fp, enabled).ok() { state.current = new_wce; } } @@ -141,7 +154,7 @@ impl Drop for WorkerState { // initially opened it. if let Some(state) = self.wce_state.get_mut().unwrap().as_mut() { if state.current != state.initial { - let _ = set_wce(&self.fp, state.initial); + let _ = dkioc::set_wce(&self.fp, state.initial); } } } @@ -174,17 +187,34 @@ impl FileBackend { let fp = OpenOptions::new().read(true).write(!read_only).open(p)?; let len = fp.metadata().unwrap().len(); - // TODO: attempt to query blocksize from underlying file/zvol + let disk_info = dkioc::disk_info(&fp); + + // Do not use the device-queried block size for now. Guests get upset if + // this changes, and it is likely differen than the old default of 512B let block_size = opts.block_size.unwrap_or(block::DEFAULT_BLOCK_SIZE); let info = block::DeviceInfo { block_size, total_size: len / u64::from(block_size), read_only, + supports_discard: disk_info.discard_mech.is_some(), }; let skip_flush = opts.skip_flush.unwrap_or(false); + let wce_state = if !read_only { + disk_info + .wce_state + .map(|initial| WceState { initial, current: initial }) + } else { + None + }; Ok(Arc::new(Self { - state: WorkerState::new(fp, info, skip_flush), + state: WorkerState::new( + fp, + info, + skip_flush, + wce_state, + disk_info.discard_mech, + ), worker_count, workers: ThreadGroup::new(), })) @@ -238,30 +268,200 @@ impl block::Backend for FileBackend { } } -/// Attempt to query the Write-Cache-Enable state for a given open device -/// -/// Block devices (including "real" disks and zvols) will regard all writes as -/// synchronous when performed via the /dev/rdsk endpoint when WCE is not -/// enabled. With WCE enabled, writes can be cached on the device, to be -/// flushed later via fsync(). -fn get_wce(fp: &File) -> Option { - let ft = fp.metadata().ok()?.file_type(); - if !ft.is_char_device() { - return None; +mod dkioc { + #![allow(non_camel_case_types)] + + use std::fs::File; + use std::io::Result; + use std::os::raw::{c_int, c_longlong, c_uint}; + use std::os::unix::fs::FileTypeExt; + use std::os::unix::io::AsRawFd; + + use crate::block::DEFAULT_BLOCK_SIZE; + use crate::util::ioctl; + + const DKIOC: i32 = 0x04 << 8; + const DKIOCGETWCE: i32 = DKIOC | 36; + const DKIOCSETWCE: i32 = DKIOC | 37; + const DKIOCGMEDIAINFOEXT: i32 = DKIOC | 48; + const DKIOCFREE: i32 = DKIOC | 50; + const DKIOC_CANFREE: i32 = DKIOC | 60; + + #[derive(Copy, Clone)] + pub(crate) enum DiscardMech { + /// Discard via `ioctl(DKIOCFREE)` + DkiocFree, + /// Discard via `fcntl(F_FREESP)` + FnctlFreesp, } - let mut res: c_int = 0; - let _ = unsafe { - ioctl(fp.as_raw_fd(), DKIOCGETWCE, &mut res as *mut c_int as _).ok()? - }; - Some(res != 0) -} + #[derive(Copy, Clone)] + #[allow(unused, dead_code)] + pub(crate) struct DiskInfo { + /// WCE state (if any) for disk + /// + /// Block devices (including "real" disks and zvols) will regard all writes as + /// synchronous when performed via the /dev/rdsk endpoint when WCE is not + /// enabled. With WCE enabled, writes can be cached on the device, to be + /// flushed later via fsync(). + pub wce_state: Option, + /// Block size of disk + pub block_size: u32, + /// Does the disk support use of DKIOCFREE or F_FREESP? + pub discard_mech: Option, + } + impl Default for DiskInfo { + fn default() -> Self { + Self { + wce_state: None, + block_size: DEFAULT_BLOCK_SIZE, + discard_mech: None, + } + } + } + + pub(crate) fn disk_info(fp: &File) -> DiskInfo { + match fp.metadata() { + Ok(ft) if ft.file_type().is_char_device() => { + // Continue on to attempt DKIOC lookups on raw disk devices + } + Ok(ft) if ft.file_type().is_file() => { + // Assume fcntl(F_FREESP) support for files + return DiskInfo { + discard_mech: Some(DiscardMech::FnctlFreesp), + ..Default::default() + }; + } + _ => { + return DiskInfo::default(); + } + } + + let wce_state = unsafe { + let mut res: c_int = 0; + + ioctl(fp.as_raw_fd(), DKIOCGETWCE, &mut res as *mut c_int as _) + .ok() + .map(|_| res != 0) + }; + + let can_free = unsafe { + let mut res: c_int = 0; + ioctl(fp.as_raw_fd(), DKIOC_CANFREE, &mut res as *mut c_int as _) + .ok() + .map(|_| res != 0) + .unwrap_or(false) + }; + + let block_size = unsafe { + let mut info = dk_minfo_ext::default(); + ioctl( + fp.as_raw_fd(), + DKIOCGMEDIAINFOEXT, + &mut info as *mut dk_minfo_ext as _, + ) + .ok() + .map(|_| info.dki_pbsize) + .unwrap_or(DEFAULT_BLOCK_SIZE) + }; + + DiskInfo { + wce_state, + block_size, + discard_mech: can_free.then_some(DiscardMech::DkiocFree), + } + } + + /// Attempt to set the Write-Cache-Enable state for a given open device + pub(crate) fn set_wce(fp: &File, enabled: bool) -> Result { + let mut flag: c_int = enabled.into(); + unsafe { + ioctl(fp.as_raw_fd(), DKIOCSETWCE, &mut flag as *mut c_int as _) + .map(|_| enabled) + } + } + + pub(crate) fn do_discard( + fp: &File, + mech: DiscardMech, + off: u64, + len: u64, + ) -> Result<()> { + match mech { + DiscardMech::DkiocFree => { + let mut req = dkioc_free_list { + dfl_flags: 0, + dfl_num_exts: 1, + dfl_offset: 0, + dfl_exts: [dkioc_free_list_ext { + dfle_start: off, + dfle_length: len, + }], + }; + unsafe { + ioctl( + fp.as_raw_fd(), + DKIOCFREE, + &mut req as *mut dkioc_free_list as _, + )?; + }; + Ok(()) + } + DiscardMech::FnctlFreesp => { + let mut fl = libc::flock { + l_type: libc::F_WRLCK as i16, + l_whence: 0, + l_start: off as i64, + l_len: len as i64, + // Ugly hack to zero out struct members we do not care about + ..unsafe { std::mem::MaybeUninit::zeroed().assume_init() } + }; + + // Make this buildable on non-illumos, despite the F_FREESP + // fnctl command being unavailable elsewhere. + #[cfg(target_os = "illumos")] + let fcntl_cmd = libc::F_FREESP; + #[cfg(not(target_os = "illumos"))] + let fcntl_cmd = -1; + + let res = unsafe { + libc::fcntl( + fp.as_raw_fd(), + fcntl_cmd, + &mut fl as *mut libc::flock as *mut libc::c_void, + ) + }; + if res != 0 { + Err(std::io::Error::last_os_error()) + } else { + Ok(()) + } + } + } + } + + #[derive(Copy, Clone, Default)] + #[repr(C)] + struct dkioc_free_list_ext { + dfle_start: u64, + dfle_length: u64, + } + + #[derive(Copy, Clone, Default)] + #[repr(C)] + struct dkioc_free_list { + dfl_flags: u64, + dfl_num_exts: u64, + dfl_offset: u64, + dfl_exts: [dkioc_free_list_ext; 1], + } -/// Attempt to set the Write-Cache-Enable state for a given open device -fn set_wce(fp: &File, enabled: bool) -> Result { - let mut flag: c_int = enabled.into(); - unsafe { - ioctl(fp.as_raw_fd(), DKIOCSETWCE, &mut flag as *mut c_int as _) - .map(|_| enabled) + #[derive(Copy, Clone, Default)] + #[repr(C)] + struct dk_minfo_ext { + dki_media_type: c_uint, + dki_lbsize: c_uint, + dki_capacity: c_longlong, + dki_pbsize: c_uint, } } diff --git a/lib/propolis/src/block/in_memory.rs b/lib/propolis/src/block/in_memory.rs index 0eb6c2dad..59645ab48 100644 --- a/lib/propolis/src/block/in_memory.rs +++ b/lib/propolis/src/block/in_memory.rs @@ -36,6 +36,11 @@ impl WorkingState { req.complete(block::Result::ReadOnly); continue; } + if req.op.is_discard() { + // Punt on discard support + req.complete(block::Result::Unsupported); + continue; + } let mem = match acc_mem.access() { Some(m) => m, @@ -84,6 +89,9 @@ impl WorkingState { block::Operation::Flush => { // nothing to do } + block::Operation::Discard(..) => { + unreachable!("handled in processing_loop()"); + } } Ok(()) @@ -119,6 +127,7 @@ impl InMemoryBackend { block_size, total_size: len as u64 / u64::from(block_size), read_only: opts.read_only.unwrap_or(false), + supports_discard: false, }, }), worker_count, diff --git a/lib/propolis/src/block/mem_async.rs b/lib/propolis/src/block/mem_async.rs index ff733c56e..1d0f3c53e 100644 --- a/lib/propolis/src/block/mem_async.rs +++ b/lib/propolis/src/block/mem_async.rs @@ -42,6 +42,10 @@ impl WorkingState { req.complete(block::Result::ReadOnly); continue; } + if req.oper().is_discard() { + req.complete(block::Result::Unsupported); + continue; + } let res = match acc_mem .access() .and_then(|mem| self.process_request(&req, &mem).ok()) @@ -97,6 +101,9 @@ impl WorkingState { block::Operation::Flush => { // nothing to do } + block::Operation::Discard(..) => { + unreachable!("handled in processing_loop()") + } } Ok(()) @@ -132,6 +139,7 @@ impl MemAsyncBackend { block_size, total_size: size / u64::from(block_size), read_only: opts.read_only.unwrap_or(false), + supports_discard: false, }, seg, }), diff --git a/lib/propolis/src/block/mod.rs b/lib/propolis/src/block/mod.rs index cd210ed25..96745beda 100644 --- a/lib/propolis/src/block/mod.rs +++ b/lib/propolis/src/block/mod.rs @@ -43,6 +43,7 @@ mod probes { fn block_begin_read(dev_id: u64, req_id: u64, offset: u64, len: u64) {} fn block_begin_write(dev_id: u64, req_id: u64, offset: u64, len: u64) {} fn block_begin_flush(dev_id: u64, req_id: u64) {} + fn block_begin_discard(dev_id: u64, req_id: u64, offset: u64, len: u64) {} fn block_complete_read( dev_id: u64, @@ -68,6 +69,14 @@ mod probes { queue_ns: u64, ) { } + fn block_complete_discard( + dev_id: u64, + req_id: u64, + result: u8, + proc_ns: u64, + queue_ns: u64, + ) { + } } /// Type of operations which may be issued to a virtual block device. @@ -79,6 +88,8 @@ pub enum Operation { Write(ByteOffset, ByteLen), /// Flush buffer(s) Flush, + /// Discard/UNMAP/deallocate region + Discard(ByteOffset, ByteLen), } impl Operation { pub const fn is_read(&self) -> bool { @@ -90,6 +101,9 @@ impl Operation { pub const fn is_flush(&self) -> bool { matches!(self, Operation::Flush) } + pub const fn is_discard(&self) -> bool { + matches!(self, Operation::Discard(..)) + } } /// Result of a block [`Request`] @@ -147,6 +161,11 @@ impl Request { Self { op, regions: Vec::new(), marker: None } } + pub fn new_discard(off: ByteOffset, len: ByteLen) -> Self { + let op = Operation::Discard(off, len); + Self { op, regions: Vec::new(), marker: None } + } + /// Type of operation being issued. pub fn oper(&self) -> Operation { self.op @@ -165,7 +184,7 @@ impl Request { Operation::Write(..) => { self.regions.iter().map(|r| mem.readable_region(r)).collect() } - Operation::Flush => None, + Operation::Flush | Operation::Discard(..) => None, } } @@ -193,6 +212,8 @@ pub struct DeviceInfo { pub total_size: u64, /// Is the device read-only pub read_only: bool, + /// Does the device support discard/UNMAP + pub supports_discard: bool, } /// Options to control behavior of block backend. diff --git a/lib/propolis/src/block/tracking.rs b/lib/propolis/src/block/tracking.rs index 437a034cb..7e7890678 100644 --- a/lib/propolis/src/block/tracking.rs +++ b/lib/propolis/src/block/tracking.rs @@ -131,6 +131,11 @@ impl Tracking { Operation::Flush => { probes::block_begin_flush!(|| { (devid, id) }); } + Operation::Discard(off, len) => { + probes::block_begin_discard!(|| { + (devid, id, off as u64, len as u64) + }); + } } req @@ -169,6 +174,11 @@ impl Tracking { (devid, id, rescode, proc_ns, queue_ns) }); } + Operation::Discard(..) => { + probes::block_complete_discard!(|| { + (devid, id, rescode, proc_ns, queue_ns) + }); + } } if let Some(cb) = guard.on_completion.as_ref() { diff --git a/lib/propolis/src/hw/nvme/requests.rs b/lib/propolis/src/hw/nvme/requests.rs index 0ff12a82f..09d341d8e 100644 --- a/lib/propolis/src/hw/nvme/requests.rs +++ b/lib/propolis/src/hw/nvme/requests.rs @@ -173,6 +173,9 @@ impl PciNvme { Operation::Flush => { probes::nvme_flush_complete!(|| (qid, cid, resnum)); } + Operation::Discard(..) => { + unreachable!("discard not supported in NVMe for now"); + } } let guard = self.mem_access(); diff --git a/lib/propolis/src/hw/virtio/block.rs b/lib/propolis/src/hw/virtio/block.rs index 79d3076c5..cf468499a 100644 --- a/lib/propolis/src/hw/virtio/block.rs +++ b/lib/propolis/src/hw/virtio/block.rs @@ -25,6 +25,9 @@ use lazy_static::lazy_static; /// Sizing for virtio-block is specified in 512B sectors const SECTOR_SZ: usize = 512; +/// Arbitrary limit to sectors permitted per discard request +const MAX_DISCARD_SECTORS: u32 = ((1024 * 1024) / SECTOR_SZ) as u32; + struct CompletionPayload { /// ID of original request. rid: u16, @@ -84,6 +87,27 @@ impl PciVirtioBlock { BlockReg::Unused => { ro.fill(0); } + BlockReg::MaxDiscardSectors => { + // Arbitrarily limit to 1MiB (or the device size, if smaller) + let sz = u32::min( + MAX_DISCARD_SECTORS, + (info.total_size / SECTOR_SZ as u64) as u32, + ); + ro.write_u32(if info.supports_discard { sz } else { 0 }); + } + BlockReg::MaxDiscardSeg => { + // If the device supports discard operations, only permit one + // segment (LBA/size) per request. + ro.write_u32(if info.supports_discard { 1 } else { 0 }); + } + BlockReg::DiscardSectorAlign => { + // Expect that discard operations are block-aligned + ro.write_u32(if info.supports_discard { + info.block_size / SECTOR_SZ as u32 + } else { + 0 + }); + } _ => { // XXX: all zeroes for now ro.fill(0); @@ -150,6 +174,22 @@ impl PciVirtioBlock { CompletionPayload { rid, chain }, )) } + VIRTIO_BLK_T_DISCARD => { + let mut detail = DiscardWriteZeroes::default(); + if !chain.read(&mut detail, &mem) { + Err(chain) + } else { + let off = detail.sector as usize * SECTOR_SZ; + let sz = detail.num_sectors as usize * SECTOR_SZ; + probes::vioblk_discard_enqueue!(|| ( + rid, off as u64, sz as u64, + )); + Ok(self.block_tracking.track( + block::Request::new_discard(off, sz), + CompletionPayload { rid, chain }, + )) + } + } _ => Err(chain), }; match req { @@ -192,6 +232,9 @@ impl PciVirtioBlock { block::Operation::Flush => { probes::vioblk_flush_complete!(|| (rid, resnum)); } + block::Operation::Discard(..) => { + probes::vioblk_discard_complete!(|| (rid, resnum)); + } } chain.write(&resnum, &mem); vq.push_used(chain, &mem); @@ -217,6 +260,9 @@ impl VirtioDevice for PciVirtioBlock { if info.read_only { feat |= VIRTIO_BLK_F_RO; } + if info.supports_discard { + feat |= VIRTIO_BLK_F_DISCARD; + } feat } fn set_features(&self, _feat: u32) -> Result<(), ()> { @@ -305,6 +351,14 @@ struct VbReq { sector: u64, } +#[derive(Copy, Clone, Debug, Default)] +#[repr(C)] +struct DiscardWriteZeroes { + sector: u64, + num_sectors: u32, + flags: u32, +} + #[derive(Copy, Clone, Eq, PartialEq, Debug)] enum BlockReg { Capacity, @@ -385,4 +439,7 @@ mod probes { fn vioblk_flush_enqueue(id: u16) {} fn vioblk_flush_complete(id: u16, res: u8) {} + + fn vioblk_discard_enqueue(id: u16, off: u64, sz: u64) {} + fn vioblk_discard_complete(id: u16, res: u8) {} }