diff --git a/sled-agent/src/config.rs b/sled-agent/src/config.rs
index b281ebf358a..58e3338c292 100644
--- a/sled-agent/src/config.rs
+++ b/sled-agent/src/config.rs
@@ -58,10 +58,17 @@ pub struct Config {
     // TODO: Remove once this can be auto-detected.
     pub sidecar_revision: SidecarRevision,
     /// Optional percentage of DRAM to reserve for guest memory
-    pub vmm_reservoir_percentage: Option<u8>,
+    pub vmm_reservoir_percentage: Option<f32>,
     /// Optional DRAM to reserve for guest memory in MiB (mutually exclusive
     /// option with vmm_reservoir_percentage).
     pub vmm_reservoir_size_mb: Option<u32>,
+    /// Amount of memory to set aside in anticipation of use for services that
+    /// will have roughly constant memory use. These are services that may have
+    /// zero or one instances on a given sled: internal DNS, MGS, Nexus,
+    /// ClickHouse, and so on. For a sled that happens not to run these kinds of
+    /// control plane services, this memory is "wasted", but it ensures the sled
+    /// could run those services if reconfiguration requires it.
+    pub control_plane_memory_earmark_mb: Option<u32>,
     /// Optional swap device size in GiB
     pub swap_device_size_gb: Option<u32>,
     /// Optional VLAN ID to be used for tagging guest VNICs.
@@ -181,8 +188,8 @@ mod test {
             let entry = entry.unwrap();
             if entry.file_name() == "config.toml" {
                 let path = entry.path();
-                Config::from_file(&path).unwrap_or_else(|_| {
-                    panic!("Failed to parse config {path}")
+                Config::from_file(&path).unwrap_or_else(|e| {
+                    panic!("Failed to parse config {path}: {e}")
                 });
                 configs_seen += 1;
             }
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index cb61484b629..0c22a9d5ef4 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -69,7 +69,7 @@ use sled_agent_types::zone_bundle::{
     PriorityOrder, StorageLimit, ZoneBundleMetadata,
 };
 use sled_diagnostics::{SledDiagnosticsCmdError, SledDiagnosticsCmdOutput};
-use sled_hardware::{HardwareManager, underlay};
+use sled_hardware::{HardwareManager, MemoryReservations, underlay};
 use sled_hardware_types::Baseboard;
 use sled_hardware_types::underlay::BootstrapInterface;
 use sled_storage::manager::StorageHandle;
@@ -495,6 +495,15 @@ impl SledAgent {
             *sled_address.ip(),
         );
 
+        // The VMM reservoir is configured with respect to what's left after
+        // accounting for relatively fixed and predictable uses.
+        // We expect certain amounts of memory to be set aside for kernel,
+        // buffer, or control plane uses.
+        let memory_sizes = MemoryReservations::new(
+            long_running_task_handles.hardware_manager.clone(),
+            config.control_plane_memory_earmark_mb,
+        );
+
         // Configure the VMM reservoir as either a percentage of DRAM or as an
         // exact size in MiB.
         let reservoir_mode = ReservoirMode::from_config(
@@ -502,11 +511,8 @@
             config.vmm_reservoir_size_mb,
         );
 
-        let vmm_reservoir_manager = VmmReservoirManager::spawn(
-            &log,
-            long_running_task_handles.hardware_manager.clone(),
-            reservoir_mode,
-        );
+        let vmm_reservoir_manager =
+            VmmReservoirManager::spawn(&log, memory_sizes, reservoir_mode);
 
         let instances = InstanceManager::new(
             parent_log.clone(),
diff --git a/sled-agent/src/vmm_reservoir.rs b/sled-agent/src/vmm_reservoir.rs
index d1f8a892063..ff7131d5f7b 100644
--- a/sled-agent/src/vmm_reservoir.rs
+++ b/sled-agent/src/vmm_reservoir.rs
@@ -12,7 +12,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::thread;
 use tokio::sync::{broadcast, oneshot};
 
-use sled_hardware::HardwareManager;
+use sled_hardware::MemoryReservations;
 
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
@@ -35,7 +35,7 @@
 #[derive(Debug, Clone, Copy)]
 pub enum ReservoirMode {
     Size(u32),
-    Percentage(u8),
+    Percentage(f32),
 }
 
 impl ReservoirMode {
@@ -44,7 +44,7 @@
     ///
     /// Panic upon invalid configuration
     pub fn from_config(
-        percentage: Option<u8>,
+        percentage: Option<f32>,
         size_mb: Option<u32>,
     ) -> Option<ReservoirMode> {
         match (percentage, size_mb) {
@@ -135,6 +135,7 @@ impl VmmReservoirManagerHandle {
 
 /// Manage the VMM reservoir in a background thread
 pub struct VmmReservoirManager {
+    memory_reservations: MemoryReservations,
     reservoir_size: Arc<AtomicU64>,
     rx: flume::Receiver<ReservoirManagerMsg>,
     size_updated_tx: broadcast::Sender<()>,
@@ -146,7 +147,7 @@
 impl VmmReservoirManager {
     pub fn spawn(
         log: &Logger,
-        hardware_manager: HardwareManager,
+        memory_reservations: sled_hardware::MemoryReservations,
         reservoir_mode: Option<ReservoirMode>,
     ) -> VmmReservoirManagerHandle {
         let log = log.new(o!("component" => "VmmReservoirManager"));
@@ -157,15 +158,15 @@
         let (tx, rx) = flume::bounded(0);
         let reservoir_size = Arc::new(AtomicU64::new(0));
         let manager = VmmReservoirManager {
+            memory_reservations,
             reservoir_size: reservoir_size.clone(),
             size_updated_tx: size_updated_tx.clone(),
             _size_updated_rx,
             rx,
             log,
         };
-        let _manager_handle = Arc::new(thread::spawn(move || {
-            manager.run(hardware_manager, reservoir_mode)
-        }));
+        let _manager_handle =
+            Arc::new(thread::spawn(move || manager.run(reservoir_mode)));
         VmmReservoirManagerHandle {
             reservoir_size,
             tx,
@@ -174,23 +175,18 @@
         }
     }
 
-    fn run(
-        self,
-        hardware_manager: HardwareManager,
-        reservoir_mode: Option<ReservoirMode>,
-    ) {
+    fn run(self, reservoir_mode: Option<ReservoirMode>) {
         match reservoir_mode {
             None => warn!(self.log, "Not using VMM reservoir"),
             Some(ReservoirMode::Size(0))
-            | Some(ReservoirMode::Percentage(0)) => {
+            | Some(ReservoirMode::Percentage(0.0)) => {
                 warn!(
                     self.log,
                     "Not using VMM reservoir (size 0 bytes requested)"
                 )
             }
             Some(mode) => {
-                if let Err(e) = self.set_reservoir_size(&hardware_manager, mode)
-                {
+                if let Err(e) = self.set_reservoir_size(mode) {
                     error!(self.log, "Failed to setup VMM reservoir: {e}");
                 }
             }
@@ -198,7 +194,7 @@
 
         while let Ok(msg) = self.rx.recv() {
             let ReservoirManagerMsg::SetReservoirSize { mode, reply_tx } = msg;
-            match self.set_reservoir_size(&hardware_manager, mode) {
+            match self.set_reservoir_size(mode) {
                 Ok(()) => {
                     let _ = reply_tx.send(Ok(()));
                 }
@@ -213,33 +209,28 @@
     /// Sets the VMM reservoir to the requested percentage of usable physical
     /// RAM or to a size in MiB. Either mode will round down to the nearest
     /// aligned size required by the control plane.
-    fn set_reservoir_size(
-        &self,
-        hardware: &sled_hardware::HardwareManager,
-        mode: ReservoirMode,
-    ) -> Result<(), Error> {
-        let hardware_physical_ram_bytes = hardware.usable_physical_ram_bytes();
+    fn set_reservoir_size(&self, mode: ReservoirMode) -> Result<(), Error> {
+        let vmm_eligible_memory = self.memory_reservations.vmm_eligible();
         let req_bytes = match mode {
             ReservoirMode::Size(mb) => {
                 let bytes = ByteCount::from_mebibytes_u32(mb).to_bytes();
-                if bytes > hardware_physical_ram_bytes {
+                if bytes > vmm_eligible_memory {
                     return Err(Error::ReservoirConfig(format!(
-                        "cannot specify a reservoir of {bytes} bytes when \
-                        physical memory is {hardware_physical_ram_bytes} bytes",
+                        "cannot specify a reservoir of {bytes} bytes when the \
+                        maximum reservoir size is {vmm_eligible_memory} bytes",
                     )));
                 }
                 bytes
             }
             ReservoirMode::Percentage(percent) => {
-                if !matches!(percent, 1..=99) {
+                if !matches!(percent, 0.1..100.0) {
                     return Err(Error::ReservoirConfig(format!(
                         "VMM reservoir percentage of {} must be between 0 and \
                         100",
                         percent
                     )));
                 };
-                (hardware_physical_ram_bytes as f64
-                    * (f64::from(percent) / 100.0))
+                (vmm_eligible_memory as f64 * (f64::from(percent) / 100.0))
                     .floor() as u64
             }
         };
@@ -258,15 +249,16 @@
         }
 
         // The max ByteCount value is i64::MAX, which is ~8 million TiB.
-        // As this value is either a percentage of DRAM or a size in MiB
-        // represented as a u32, constructing this should always work.
+        // As this value is either a percentage of otherwise-unbudgeted DRAM or
+        // a size in MiB represented as a u32, constructing this should always
+        // work.
         let reservoir_size = ByteCount::try_from(req_bytes_aligned).unwrap();
         if let ReservoirMode::Percentage(percent) = mode {
             info!(
                 self.log,
-                "{}% of {} physical ram = {} bytes)",
+                "{}% of {} VMM eligible ram = {} bytes)",
                 percent,
-                hardware_physical_ram_bytes,
+                vmm_eligible_memory,
                 req_bytes,
             );
         }
diff --git a/sled-hardware/src/illumos/mod.rs b/sled-hardware/src/illumos/mod.rs
index 06163c3f64b..057db6012b6 100644
--- a/sled-hardware/src/illumos/mod.rs
+++ b/sled-hardware/src/illumos/mod.rs
@@ -205,6 +205,7 @@ struct HardwareView {
     disks: HashMap<DiskIdentity, UnparsedDisk>,
     baseboard: Option<Baseboard>,
     online_processor_count: u32,
+    usable_physical_pages: u64,
     usable_physical_ram_bytes: u64,
 }
 
@@ -220,6 +221,7 @@ impl HardwareView {
             disks: HashMap::new(),
             baseboard: None,
             online_processor_count: sysconf::online_processor_count()?,
+            usable_physical_pages: sysconf::usable_physical_pages()?,
            usable_physical_ram_bytes: sysconf::usable_physical_ram_bytes()?,
         })
     }
@@ -230,6 +232,7 @@
             disks: HashMap::new(),
             baseboard: None,
             online_processor_count: sysconf::online_processor_count()?,
+            usable_physical_pages: sysconf::usable_physical_pages()?,
             usable_physical_ram_bytes: sysconf::usable_physical_ram_bytes()?,
         })
     }
@@ -798,6 +801,10 @@
         self.inner.lock().unwrap().online_processor_count
     }
 
+    pub fn usable_physical_pages(&self) -> u64 {
+        self.inner.lock().unwrap().usable_physical_pages
+    }
+
     pub fn usable_physical_ram_bytes(&self) -> u64 {
         self.inner.lock().unwrap().usable_physical_ram_bytes
     }
diff --git a/sled-hardware/src/illumos/sysconf.rs b/sled-hardware/src/illumos/sysconf.rs
index 984292886f2..e06f17fc051 100644
--- a/sled-hardware/src/illumos/sysconf.rs
+++ b/sled-hardware/src/illumos/sysconf.rs
@@ -25,14 +25,21 @@ pub fn online_processor_count() -> Result<u32, Error> {
     Ok(u32::try_from(res)?)
 }
 
-/// Returns the amount of RAM on this sled, in bytes.
-pub fn usable_physical_ram_bytes() -> Result<u64, Error> {
-    let phys_pages: u64 = illumos_utils::libc::sysconf(libc::_SC_PHYS_PAGES)
+/// Returns the number of physical RAM pages on this sled.
+pub fn usable_physical_pages() -> Result<u64, Error> {
+    let pages = illumos_utils::libc::sysconf(libc::_SC_PHYS_PAGES)
         .map_err(|e| Error::Sysconf { arg: "physical pages", e })?
         .try_into()?;
+    Ok(pages)
+}
+
+/// Returns the amount of RAM on this sled, in bytes.
+pub fn usable_physical_ram_bytes() -> Result<u64, Error> {
     let page_size: u64 = illumos_utils::libc::sysconf(libc::_SC_PAGESIZE)
         .map_err(|e| Error::Sysconf { arg: "physical page size", e })?
         .try_into()?;
-    Ok(phys_pages * page_size)
+    // XXX: if we eventually have pages with mixed sizes, this may be wrong!
+    // I'm not even sure how we'd calculate this in such a world!
+    Ok(usable_physical_pages()? * page_size)
 }
 
diff --git a/sled-hardware/src/lib.rs b/sled-hardware/src/lib.rs
index d210fbb1ce9..821f97cfd15 100644
--- a/sled-hardware/src/lib.rs
+++ b/sled-hardware/src/lib.rs
@@ -75,3 +75,67 @@ pub enum SledMode {
     /// Force sled to run as a Scrimlet
     Scrimlet { asic: DendriteAsic },
 }
+
+/// Accounting for high watermark memory usage for various system purposes
+#[derive(Clone)]
+pub struct MemoryReservations {
+    hardware_manager: HardwareManager,
+    /// The amount of memory expected to be used if "control plane" services
+    /// were all running on this sled. "control plane" here refers to services
+    /// that have roughly fixed memory use given differing sled hardware
+    /// configurations. DNS (internal, external), Nexus, Cockroach, or
+    /// ClickHouse are all examples of "control plane" here.
+    ///
+    /// This is a pessimistic overestimate; it is unlikely
+    /// (and one might say undesirable) that all such services are colocated on
+    /// a sled, and (as described in RFD 413) the budgeting for each service's
+    /// RAM must include headroom for those services potentially forking and
+    /// bursting required swap or resident pages.
+    //
+    // XXX: This is really something we should be told by Nexus, perhaps after
+    // starting with this conservative estimate to get the sled started.
+    control_plane_earmark_bytes: u64,
+    // XXX: Crucible involves some amount of memory in support of the volumes it
+    // manages. We should collect zpool size and estimate the memory that would
+    // be used if all available storage was dedicated to Crucible volumes. For
+    // now this is part of the control plane earmark.
+}
+
+impl MemoryReservations {
+    pub fn new(
+        hardware_manager: HardwareManager,
+        control_plane_earmark_mib: Option<u32>,
+    ) -> MemoryReservations {
+        const MIB: u64 = 1024 * 1024;
+        let control_plane_earmark_bytes =
+            u64::from(control_plane_earmark_mib.unwrap_or(0)) * MIB;
+
+        Self { hardware_manager, control_plane_earmark_bytes }
+    }
+
+    /// Compute the amount of physical memory that could be set aside for the
+    /// VMM reservoir.
+    ///
+    /// The actual VMM reservoir will be smaller than this amount, and is either
+    /// a fixed amount of memory specified by `ReservoirMode::Size` or
+    /// a percentage of this amount specified by `ReservoirMode::Percentage`.
+    pub fn vmm_eligible(&self) -> u64 {
+        let hardware_physical_ram_bytes =
+            self.hardware_manager.usable_physical_ram_bytes();
+        // Don't like hardcoding a struct size from the host OS here like
+        // this, maybe we shuffle some bits around before merging. On the
+        // other hand, the last time page_t changed was illumos-gate commit
+        // a5652762e5 from 2006.
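+        // As a rough worked example with hypothetical numbers: a sled with
+        // 1 TiB of DRAM and 4 KiB pages has ~268 million pages, or about
+        // 30 GiB of page_t structures, so with a 44 GiB control plane
+        // earmark roughly 950 GiB would remain eligible for the reservoir.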
+        const PAGE_T_SIZE: u64 = 120;
+        let max_page_t_space =
+            self.hardware_manager.usable_physical_pages() * PAGE_T_SIZE;
+
+        hardware_physical_ram_bytes
+            - max_page_t_space
+            - self.control_plane_earmark_bytes
+    }
+}
diff --git a/sled-hardware/src/non_illumos/mod.rs b/sled-hardware/src/non_illumos/mod.rs
index 707ccd8d45f..d03484bbfa1 100644
--- a/sled-hardware/src/non_illumos/mod.rs
+++ b/sled-hardware/src/non_illumos/mod.rs
@@ -45,6 +45,10 @@ impl HardwareManager {
         unimplemented!("Accessing hardware unsupported on non-illumos");
     }
 
+    pub fn usable_physical_pages(&self) -> u64 {
+        unimplemented!("Accessing hardware unsupported on non-illumos");
+    }
+
     pub fn usable_physical_ram_bytes(&self) -> u64 {
         unimplemented!("Accessing hardware unsupported on non-illumos");
     }
diff --git a/smf/sled-agent/gimlet-standalone/config.toml b/smf/sled-agent/gimlet-standalone/config.toml
index 5ab9d49ede8..fae849739dc 100644
--- a/smf/sled-agent/gimlet-standalone/config.toml
+++ b/smf/sled-agent/gimlet-standalone/config.toml
@@ -20,7 +20,19 @@ skip_timesync = true
 
 # Percentage of usable physical DRAM to use for the VMM reservoir, which
 # guest memory is pulled from.
-vmm_reservoir_percentage = 80
+vmm_reservoir_percentage = 86.3
+# The amount of memory held back for services which exist between zero and one
+# times on this Gimlet. This currently includes some additional terms
+# reflecting OS memory use under load.
+#
+# As of writing, this is the sum of the following items from RFD 413:
+#   * Network buffer slush: 18 GiB
+#   * Other kernel heap: 20 GiB
+#   * ZFS ARC minimum: 5 GiB
+#   * Sled agent: 0.5 GiB
+#   * Maghemite: 0.25 GiB
+#   * NTP: 0.25 GiB
+control_plane_memory_earmark_mb = 45056
 
 # Swap device size for the system. The device is a sparsely allocated zvol on
 # the internal zpool of the M.2 that we booted from.
diff --git a/smf/sled-agent/gimlet/config.toml b/smf/sled-agent/gimlet/config.toml
index a6392406895..6290c3180e0 100644
--- a/smf/sled-agent/gimlet/config.toml
+++ b/smf/sled-agent/gimlet/config.toml
@@ -23,7 +23,19 @@ data_link = "cxgbe0"
 
 # Percentage of usable physical DRAM to use for the VMM reservoir, which
 # guest memory is pulled from.
-vmm_reservoir_percentage = 80
+vmm_reservoir_percentage = 86.3
+# The amount of memory held back for services which exist between zero and one
+# times on this Gimlet. This currently includes some additional terms
+# reflecting OS memory use under load.
+#
+# As of writing, this is the sum of the following items from RFD 413:
+#   * Network buffer slush: 18 GiB
+#   * Other kernel heap: 20 GiB
+#   * ZFS ARC minimum: 5 GiB
+#   * Sled agent: 0.5 GiB
+#   * Maghemite: 0.25 GiB
+#   * NTP: 0.25 GiB
+control_plane_memory_earmark_mb = 45056
 
 # Swap device size for the system. The device is a sparsely allocated zvol on
 # the internal zpool of the M.2 that we booted from.
diff --git a/smf/sled-agent/non-gimlet/config.toml b/smf/sled-agent/non-gimlet/config.toml
index 18a561d134f..b36f4c9e6e7 100644
--- a/smf/sled-agent/non-gimlet/config.toml
+++ b/smf/sled-agent/non-gimlet/config.toml
@@ -45,7 +45,24 @@ vdevs = [
 
 # Percentage of usable physical DRAM to use for the VMM reservoir, which
 # guest memory is pulled from.
-vmm_reservoir_percentage = 50
+vmm_reservoir_percentage = 60
+
+# The amount of memory held back for services which exist between zero and one
+# times on this system. As this is a non-Gimlet target, it is likely that
+# services will all cohabitate here, so this is perhaps less overallocation than
+# it would be for a Gimlet. This currently includes some additional terms
+# reflecting OS memory use under load.
+#
+# These values are inspired by Gimlet configs but scaled to expected test
+# system size: 10 Gbit network links, 64-128 GiB of DRAM. Even so, this is
+# likely an overestimate:
+#   * Network buffer slush: 2 GiB
+#   * Other kernel heap: 2 GiB
+#   * ZFS ARC minimum: 1 GiB
+#   * Sled agent: 0.5 GiB
+#   * Maghemite: 0.25 GiB
+#   * NTP: 0.25 GiB
+control_plane_memory_earmark_mb = 6144
 
 # Optionally you can specify the size of the VMM reservoir in MiB.
 # Note vmm_reservoir_percentage and vmm_reservoir_size_mb cannot be specified